[PATCH, rs6000] Add handling for UNSPEC_VSPLT_DIRECT to analyze_swaps

Bill Schmidt wschmidt@linux.vnet.ibm.com
Sat Sep 6 17:50:00 GMT 2014


Hi,

Here's one more case of special handling that allows us to optimize more
vectorized loops in analyze_swaps.  UNSPEC_VSPLT_DIRECT is used in some
cases to avoid the possibility of an endian fixup.  We can still handle
this by swapping the lane chosen as the source of the splat.

While implementing this I realized that I had had a thinko with the
adjust_extract changes in the last related patch.  When swapping
doublewords, the right change is to add or subtract n_elts/2, not to
subtract from n_elts-1.  I've corrected that issue herein as well.

I've added a new test to demonstrate the UNSPEC_VSPLT_DIRECT case.

Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no
regressions.  Ok for trunk?

Thanks,
Bill


[gcc]

2014-09-06  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* config/rs6000/rs6000.c (special_handling_values):  Add SH_SPLAT.
	(rtx_is_swappable_p): Convert UNSPEC cascading ||s to a switch
	statement; replace duplicated UNSPEC_VUPKHU_V4SF test with
	UNSPEC_VUPKLU_V4SF; allow optimization of UNSPEC_VSPLT_DIRECT with
	special handling SH_SPLAT.
	(adjust_extract): Fix test for VEC_DUPLICATE case; fix adjustment
	of extracted lane.
	(adjust_splat): New function.
	(handle_special_swappables): Call adjust_splat for SH_SPLAT.
	(dump_swap_insn_table): Add case for SH_SPLAT.

[gcc/testsuite]

2014-09-06  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.target/powerpc/swaps-p8-16.c: New test.


Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 214957)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -33524,7 +33524,8 @@ enum special_handling_values {
   SH_SUBREG,
   SH_NOSWAP_LD,
   SH_NOSWAP_ST,
-  SH_EXTRACT
+  SH_EXTRACT,
+  SH_SPLAT
 };
 
 /* Union INSN with all insns containing definitions that reach USE.
@@ -33735,43 +33736,50 @@ rtx_is_swappable_p (rtx op, unsigned int *special)
 	   vector splat are element-order sensitive.  A few of these
 	   cases might be workable with special handling if required.  */
 	int val = XINT (op, 1);
-	if (val == UNSPEC_VMRGH_DIRECT
-	    || val == UNSPEC_VMRGL_DIRECT
-	    || val == UNSPEC_VPACK_SIGN_SIGN_SAT
-	    || val == UNSPEC_VPACK_SIGN_UNS_SAT
-	    || val == UNSPEC_VPACK_UNS_UNS_MOD
-	    || val == UNSPEC_VPACK_UNS_UNS_MOD_DIRECT
-	    || val == UNSPEC_VPACK_UNS_UNS_SAT
-	    || val == UNSPEC_VPERM
-	    || val == UNSPEC_VPERM_UNS
-	    || val == UNSPEC_VPERMHI
-	    || val == UNSPEC_VPERMSI
-	    || val == UNSPEC_VPKPX
-	    || val == UNSPEC_VSLDOI
-	    || val == UNSPEC_VSLO
-	    || val == UNSPEC_VSPLT_DIRECT
-	    || val == UNSPEC_VSRO
-	    || val == UNSPEC_VSUM2SWS
-	    || val == UNSPEC_VSUM4S
-	    || val == UNSPEC_VSUM4UBS
-	    || val == UNSPEC_VSUMSWS
-	    || val == UNSPEC_VSUMSWS_DIRECT
-	    || val == UNSPEC_VSX_CONCAT
-	    || val == UNSPEC_VSX_CVSPDP
-	    || val == UNSPEC_VSX_CVSPDPN
-	    || val == UNSPEC_VSX_SET
-	    || val == UNSPEC_VSX_SLDWI
-	    || val == UNSPEC_VUNPACK_HI_SIGN
-	    || val == UNSPEC_VUNPACK_HI_SIGN_DIRECT
-	    || val == UNSPEC_VUNPACK_LO_SIGN
-	    || val == UNSPEC_VUNPACK_LO_SIGN_DIRECT
-	    || val == UNSPEC_VUPKHPX
-	    || val == UNSPEC_VUPKHS_V4SF
-	    || val == UNSPEC_VUPKHU_V4SF
-	    || val == UNSPEC_VUPKLPX
-	    || val == UNSPEC_VUPKLS_V4SF
-	    || val == UNSPEC_VUPKHU_V4SF)
-	  return 0;
+	switch (val)
+	  {
+	  default:
+	    break;
+	  case UNSPEC_VMRGH_DIRECT:
+	  case UNSPEC_VMRGL_DIRECT:
+	  case UNSPEC_VPACK_SIGN_SIGN_SAT:
+	  case UNSPEC_VPACK_SIGN_UNS_SAT:
+	  case UNSPEC_VPACK_UNS_UNS_MOD:
+	  case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
+	  case UNSPEC_VPACK_UNS_UNS_SAT:
+	  case UNSPEC_VPERM:
+	  case UNSPEC_VPERM_UNS:
+	  case UNSPEC_VPERMHI:
+	  case UNSPEC_VPERMSI:
+	  case UNSPEC_VPKPX:
+	  case UNSPEC_VSLDOI:
+	  case UNSPEC_VSLO:
+	  case UNSPEC_VSRO:
+	  case UNSPEC_VSUM2SWS:
+	  case UNSPEC_VSUM4S:
+	  case UNSPEC_VSUM4UBS:
+	  case UNSPEC_VSUMSWS:
+	  case UNSPEC_VSUMSWS_DIRECT:
+	  case UNSPEC_VSX_CONCAT:
+	  case UNSPEC_VSX_CVSPDP:
+	  case UNSPEC_VSX_CVSPDPN:
+	  case UNSPEC_VSX_SET:
+	  case UNSPEC_VSX_SLDWI:
+	  case UNSPEC_VUNPACK_HI_SIGN:
+	  case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
+	  case UNSPEC_VUNPACK_LO_SIGN:
+	  case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
+	  case UNSPEC_VUPKHPX:
+	  case UNSPEC_VUPKHS_V4SF:
+	  case UNSPEC_VUPKHU_V4SF:
+	  case UNSPEC_VUPKLPX:
+	  case UNSPEC_VUPKLS_V4SF:
+	  case UNSPEC_VUPKLU_V4SF:
+	    return 0;
+	  case UNSPEC_VSPLT_DIRECT:
+	    *special = SH_SPLAT;
+	    return 1;
+	  }
       }
 
     default:
@@ -34098,20 +34106,20 @@ permute_store (rtx_insn *insn)
 	     INSN_UID (insn));
 }
 
-/* Given OP that contains a vector extract operation, change the index
-   of the extracted lane to count from the other side of the vector.  */
+/* Given INSN that contains a vector extract operation, adjust the index
+   of the extracted lane to account for the doubleword swap.  */
 static void
 adjust_extract (rtx_insn *insn)
 {
-  rtx body = PATTERN (insn);
+  rtx src = SET_SRC (PATTERN (insn));
   /* The vec_select may be wrapped in a vec_duplicate for a splat, so
      account for that.  */
-  rtx sel = (GET_CODE (body) == VEC_DUPLICATE
-	     ? XEXP (XEXP (body, 0), 1)
-	     : XEXP (body, 1));
+  rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
   rtx par = XEXP (sel, 1);
-  int nunits = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0)));
-  XVECEXP (par, 0, 0) = GEN_INT (nunits - 1 - INTVAL (XVECEXP (par, 0, 0)));
+  int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
+  int lane = INTVAL (XVECEXP (par, 0, 0));
+  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+  XVECEXP (par, 0, 0) = GEN_INT (lane);
   INSN_CODE (insn) = -1; /* Force re-recognition.  */
   df_insn_rescan (insn);
 
@@ -34119,6 +34127,24 @@ adjust_extract (rtx_insn *insn)
     fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
 }
 
+/* Given INSN that contains a vector direct-splat operation, adjust the index
+   of the source lane to account for the doubleword swap.  */
+static void
+adjust_splat (rtx_insn *insn)
+{
+  rtx body = PATTERN (insn);
+  rtx unspec = XEXP (body, 1);
+  int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
+  int lane = INTVAL (XVECEXP (unspec, 0, 1));
+  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+  XVECEXP (unspec, 0, 1) = GEN_INT (lane);
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
+}
+
 /* The insn described by INSN_ENTRY[I] can be swapped, but only
    with special handling.  Take care of that here.  */
 static void
@@ -34160,6 +34186,11 @@ handle_special_swappables (swap_web_entry *insn_en
     case SH_EXTRACT:
       /* Change the lane on an extract operation.  */
       adjust_extract (insn);
+      break;
+    case SH_SPLAT:
+      /* Change the lane on a direct-splat operation.  */
+      adjust_splat (insn);
+      break;
     }
 }
 
@@ -34230,6 +34261,8 @@ dump_swap_insn_table (swap_web_entry *insn_entry)
 	      fputs ("special:store ", dump_file);
 	    else if (insn_entry[i].special_handling == SH_EXTRACT)
 	      fputs ("special:extract ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_SPLAT)
+	      fputs ("special:splat ", dump_file);
 	  }
 	if (insn_entry[i].web_not_optimizable)
 	  fputs ("unoptimizable ", dump_file);
Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c	(working copy)
@@ -0,0 +1,56 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "vspltw" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+#include <altivec.h>
+void abort();
+
+typedef struct xx {vector double l; vector double h;} xx;
+
+#define N 4096
+#define M 10000000
+vector float ca[N][4] = {0};
+vector float cb[N][4] = {0};
+vector float cc[N][4] = {0};
+
+__attribute__((noinline)) void foo ()
+{
+  int i;
+  vector float brow;
+
+  for (i = 0; i < N; i++) {
+
+    brow = cb[i][0];
+    cc[i][0] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+
+    brow = cb[i][1];
+    cc[i][1] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+    
+    brow = cb[i][2];
+    cc[i][2] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+    
+    brow = cb[i][3];
+    cc[i][3] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+  }
+}
+
+int main ()
+{
+  foo ();
+  return 0;
+}




More information about the Gcc-patches mailing list