This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH, rs6000] Handle vec_extract and splat patterns in analyze_swaps


Hi,

This patch adds more special handling to analyze_swaps to allow us to
improve more computations.  Previously I had disallowed VEC_SELECT in
all cases.  This is now changed to allow a select of a single lane,
either for an extract operation or for a splat operation.  If a
computation containing such operations is optimized, the selected lane
is changed to count from the other end of the vector.  Several new tests
are added to check these opportunities are now exploited.

Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no
regressions.  Is this ok for trunk?

Thanks,
Bill


[gcc]

2014-09-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* config/rs6000/rs6000.c (special_handling_values): Add
	SH_EXTRACT.
	(rtx_is_swappable_p): Look for patterns with a VEC_SELECT, perhaps
	wrapped in a VEC_DUPLICATE, representing an extract.  Mark these
	as swappable with special handling SH_EXTRACT.  Remove
	UNSPEC_VSX_XXSPLTW from the list of disallowed unspecs for the
	optimization.
	(adjust_extract): New function.
	(handle_special_swappables): Add default to case statement; add
	case for SH_EXTRACT that calls adjust_extract.
	(dump_swap_insn_table): Handle SH_EXTRACT.

[gcc/testsuite]

2014-09-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.target/powerpc/swaps-p8-13.c: New test.
	* gcc.target/powerpc/swaps-p8-14.c: New test.
	* gcc.target/powerpc/swaps-p8-15.c: New test.


Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 214879)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -33562,7 +33562,8 @@ enum special_handling_values {
   SH_CONST_VECTOR,
   SH_SUBREG,
   SH_NOSWAP_LD,
-  SH_NOSWAP_ST
+  SH_NOSWAP_ST,
+  SH_EXTRACT
 };
 
 /* Union INSN with all insns containing definitions that reach USE.
@@ -33704,6 +33705,7 @@ rtx_is_swappable_p (rtx op, unsigned int *special)
 {
   enum rtx_code code = GET_CODE (op);
   int i, j;
+  rtx parallel;
 
   switch (code)
     {
@@ -33714,7 +33716,6 @@ rtx_is_swappable_p (rtx op, unsigned int *special)
       return 1;
 
     case VEC_CONCAT:
-    case VEC_SELECT:
     case ASM_INPUT:
     case ASM_OPERANDS:
       return 0;
@@ -33732,9 +33733,31 @@ rtx_is_swappable_p (rtx op, unsigned int *special)
 	 handling.  */
       if (GET_CODE (XEXP (op, 0)) == CONST_INT)
 	return 1;
+      else if (GET_CODE (XEXP (op, 0)) == REG
+	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+	/* This catches V2DF and V2DI splat, at a minimum.  */
+	return 1;
+      else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
+	/* If the duplicated item is from a select, defer to the select
+	   processing to see if we can change the lane for the splat.  */
+	return rtx_is_swappable_p (XEXP (op, 0), special);
       else
 	return 0;
 
+    case VEC_SELECT:
+      /* A vec_extract operation is ok if we change the lane.  */
+      if (GET_CODE (XEXP (op, 0)) == REG
+	  && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
+	  && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+	  && XVECLEN (parallel, 0) == 1
+	  && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
+	{
+	  *special = SH_EXTRACT;
+	  return 1;
+	}
+      else
+	return 0;
+
     case UNSPEC:
       {
 	/* Various operations are unsafe for this optimization, at least
@@ -33777,7 +33800,6 @@ rtx_is_swappable_p (rtx op, unsigned int *special)
 	    || val == UNSPEC_VSX_CVSPDPN
 	    || val == UNSPEC_VSX_SET
 	    || val == UNSPEC_VSX_SLDWI
-	    || val == UNSPEC_VSX_XXSPLTW
 	    || val == UNSPEC_VUNPACK_HI_SIGN
 	    || val == UNSPEC_VUNPACK_HI_SIGN_DIRECT
 	    || val == UNSPEC_VUNPACK_LO_SIGN
@@ -34115,6 +34137,27 @@ permute_store (rtx_insn *insn)
 	     INSN_UID (insn));
 }
 
+/* Given OP that contains a vector extract operation, change the index
+   of the extracted lane to count from the other side of the vector.  */
+static void
+adjust_extract (rtx_insn *insn)
+{
+  rtx body = PATTERN (insn);
+  /* The vec_select may be wrapped in a vec_duplicate for a splat, so
+     account for that.  */
+  rtx sel = (GET_CODE (body) == VEC_DUPLICATE
+	     ? XEXP (XEXP (body, 0), 1)
+	     : XEXP (body, 1));
+  rtx par = XEXP (sel, 1);
+  int nunits = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0)));
+  XVECEXP (par, 0, 0) = GEN_INT (nunits - 1 - INTVAL (XVECEXP (par, 0, 0)));
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
+}
+
 /* The insn described by INSN_ENTRY[I] can be swapped, but only
    with special handling.  Take care of that here.  */
 static void
@@ -34125,6 +34168,8 @@ handle_special_swappables (swap_web_entry *insn_en
 
   switch (insn_entry[i].special_handling)
     {
+    default:
+      gcc_unreachable ();
     case SH_CONST_VECTOR:
       {
 	/* A CONST_VECTOR will only show up somewhere in the RHS of a SET.  */
@@ -34151,6 +34196,9 @@ handle_special_swappables (swap_web_entry *insn_en
       /* Convert a non-permuting store to a permuting one.  */
       permute_store (insn);
       break;
+    case SH_EXTRACT:
+      /* Change the lane on an extract operation.  */
+      adjust_extract (insn);
     }
 }
 
@@ -34219,6 +34267,8 @@ dump_swap_insn_table (swap_web_entry *insn_entry)
 	      fputs ("special:load ", dump_file);
 	    else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
 	      fputs ("special:store ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_EXTRACT)
+	      fputs ("special:extract ", dump_file);
 	  }
 	if (insn_entry[i].web_not_optimizable)
 	  fputs ("unoptimizable ", dump_file);
Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c	(working copy)
@@ -0,0 +1,53 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{
+  int i;
+  vector long long va, vb, vc, vd, tmp;
+  volatile unsigned long long three = 3;
+  vector unsigned long long threes = vec_splats (three);
+  for (i = 0; i < N; i+=2) {
+    vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+    vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+    vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+    tmp = vec_add (vb, vc);
+    tmp = vec_sub (tmp, vd);
+    tmp = vec_sra (tmp, threes);
+    x = vec_extract (tmp, 0);
+    vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+  }
+}
+
+__attribute__((noinline)) void init ()
+{
+  int i;
+  for (i = 0; i < N; ++i) {
+    cb[i] = 3 * i - 2048;
+    cc[i] = -5 * i + 93;
+    cd[i] = i + 14;
+  }
+}
+
+int main ()
+{
+  int i;
+  init ();
+  foo ();
+  for (i = 0; i < N; ++i)
+    if (ca[i] != (-3 * i - 1969) >> 3)
+      abort ();
+  if (x != ca[N-1])
+    abort ();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c	(working copy)
@@ -0,0 +1,42 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "stxsdx" } } */
+/* { dg-final { scan-assembler-times "xxpermdi" 1 } } */
+
+/* The only xxpermdi expected is for the vec_splats.  */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{
+  int i;
+  vector long long va, vb, vc, vd, tmp;
+  volatile unsigned long long three = 3;
+  vector unsigned long long threes = vec_splats (three);
+  for (i = 0; i < N; i+=2) {
+    vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+    vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+    vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+    tmp = vec_add (vb, vc);
+    tmp = vec_sub (tmp, vd);
+    tmp = vec_sra (tmp, threes);
+    x = vec_extract (tmp, 0);
+    vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+  }
+}
+
+int main ()
+{
+  foo ();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c	(working copy)
@@ -0,0 +1,49 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "xxspltw" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+#include <altivec.h>
+void abort();
+
+typedef struct xx {vector double l; vector double h;} xx;
+
+#define N 4096
+#define M 10000000
+vector float ca[N][4] = {0};
+vector float cb[N][4] = {0};
+vector float cc[N][4] = {0};
+
+__attribute__((noinline)) void foo ()
+{
+  int i;
+  for (i = 0; i < N; i++) {
+    cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]);
+
+    cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][1]), ca[i][1]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][2]), ca[i][2]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][3]), ca[i][3]);
+    
+    cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][1]), ca[i][1]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][2]), ca[i][2]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][3]), ca[i][3]);
+    
+    cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][1]), ca[i][1]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][2]), ca[i][2]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][3]), ca[i][3]);
+  }
+}
+
+int main ()
+{
+  foo ();
+  return 0;
+}



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]