[PATCH, rs6000] Correct vsx_set and vsx_extract patterns for little endian

Wed Nov 20 20:34:00 GMT 2013

Hi,

This patch corrects the various vsx_set_* and vsx_extract_* patterns to
work correctly with little endian.  For the most part this requires the
usual "subtract from N-1" modification, where N is the number of
elements.

Extracting element zero for big endian V2DI or V2DF mode is optimized
using the scalar register equivalence.  Since we can similarly optimize
extraction of element one for big endian V2DI or V2DF mode, I added a
variant that does this.  I am not sure how useful this is, and we can
remove it if you like.

The existing testcase gcc.target/powerpc/pr48258-1.c fails when counting
the number of occurrences of xxsldwi.  It expects to see 6, but we
generate 9 of them for LE.  This is because there are three extracts of
element zero of a V4SF in the testcase.  The scalar equivalence allows
us to avoid the xxsldwi in BE but not in LE.  Therefore I've disabled
this test for little endian.

Bootstrapped and tested on powerpc64{,le}-unknown-linux-gnu with no
regressions.  Is this ok for trunk?

Thanks,
Bill


gcc:

2013-11-20  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* config/rs6000/vsx.md (vsx_set_<mode>): Adjust for little endian.
	(vsx_extract_<mode>): Likewise.
	(*vsx_extract_<mode>_one_le): New LE variant on
	*vsx_extract_<mode>_zero.
	(vsx_extract_v4sf): Adjust for little endian.


gcc/testsuite:

2013-11-20  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.target/powerpc/pr48258-1.c: Skip for little endian.


Index: gcc/testsuite/gcc.target/powerpc/pr48258-1.c
===================================================================

--- gcc/testsuite/gcc.target/powerpc/pr48258-1.c	(revision 205053)
+++ gcc/testsuite/gcc.target/powerpc/pr48258-1.c	(working copy)
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*le-*-* } { "*" } { "" } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
 /* { dg-options "-O3 -mcpu=power7 -mabi=altivec -ffast-math -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvaddsp" 3 } } */
Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md	(revision 205053)
+++ gcc/config/rs6000/vsx.md	(working copy)
@@ -1497,9 +1497,10 @@
 		      UNSPEC_VSX_SET))]
   "VECTOR_MEM_VSX_P (<MODE>mode)"
 {
-  if (INTVAL (operands[3]) == 0)
+  int idx_first = BYTES_BIG_ENDIAN ? 0 : 1;
+  if (INTVAL (operands[3]) == idx_first)
     return \"xxpermdi %x0,%x2,%x1,1\";
-  else if (INTVAL (operands[3]) == 1)
+  else if (INTVAL (operands[3]) == 1 - idx_first)
     return \"xxpermdi %x0,%x1,%x2,0\";
   else
     gcc_unreachable ();
@@ -1514,8 +1515,12 @@
 			[(match_operand:QI 2 "u5bit_cint_operand" "i,i,i")])))]
   "VECTOR_MEM_VSX_P (<MODE>mode)"
 {
+  int fldDM;
   gcc_assert (UINTVAL (operands[2]) <= 1);
-  operands[3] = GEN_INT (INTVAL (operands[2]) << 1);
+  fldDM = INTVAL (operands[2]) << 1;
+  if (!BYTES_BIG_ENDIAN)
+    fldDM = 3 - fldDM;
+  operands[3] = GEN_INT (fldDM);
   return \"xxpermdi %x0,%x1,%x1,%3\";
 }
   [(set_attr "type" "vecperm")])
@@ -1535,6 +1540,21 @@
 	(const_string "fpload")))
    (set_attr "length" "4")])  
 
+;; Optimize extracting element 1 from memory for little endian
+(define_insn "*vsx_extract_<mode>_one_le"
+  [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=ws,d,?wa")
+	(vec_select:<VS_scalar>
+	 (match_operand:VSX_D 1 "indexed_or_indirect_operand" "Z,Z,Z")
+	 (parallel [(const_int 1)])))]
+  "VECTOR_MEM_VSX_P (<MODE>mode) && !WORDS_BIG_ENDIAN"
+  "lxsd%U1x %x0,%y1"
+  [(set (attr "type")
+      (if_then_else
+	(match_test "update_indexed_address_mem (operands[1], VOIDmode)")
+	(const_string "fpload_ux")
+	(const_string "fpload")))
+   (set_attr "length" "4")])  
+
 ;; Extract a SF element from V4SF
 (define_insn_and_split "vsx_extract_v4sf"
   [(set (match_operand:SF 0 "vsx_register_operand" "=f,f")
@@ -1555,7 +1575,7 @@
   rtx op2 = operands[2];
   rtx op3 = operands[3];
   rtx tmp;
-  HOST_WIDE_INT ele = INTVAL (op2);
+  HOST_WIDE_INT ele = BYTES_BIG_ENDIAN ? INTVAL (op2) : 3 - INTVAL (op2);
 
   if (ele == 0)
     tmp = op1;