[PATCH] rs6000: Vector shift-right should honor modulo semantics

Sun Feb 10 16:10:00 GMT 2019

Hi!

We had a problem report for code attempting to implement a vector right-shift for a
vector long long (V2DImode) type.  The programmer noted that we don't have a vector
splat-immediate for this mode, but cleverly realized he could use a vector char
splat-immediate since only the lower 6 bits of the shift count are read.  This is a
documented feature of both the vector shift built-ins and the underlying instruction.

Starting with GCC 8, the vector shifts are expanded early in rs6000_gimple_fold_builtin.
However, the GIMPLE folding does not currently perform the required TRUNC_MOD_EXPR to
implement the built-in semantics.  It appears that this was caught earlier for vector
shift-left and fixed, but the same problem was not fixed for vector shift-right.
This patch fixes that.

While fixing that problem, I noted that we get inferior code generation when we try
to fold the vector char splat earlier, due to the type mismatch and some additional
optimizations performed in the middle end.  Because this is a rare circumstance, it
makes sense to avoid the GIMPLE folding in this case and allow the back end to do
the expansion; this produces the clean code we're expecting with the vspltisb intact.

I've added executable tests for both shift-right algebraic and shift-right logical.
Both fail prior to applying the patch, and work correctly afterwards.

Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions.  Is
this okay for trunk, and for GCC 8.3 if there is no fallout by the end of the
week?

Thanks,
Bill


[gcc]

2019-02-08  Bill Schmidt  <wschmidt@linux.ibm.com>

	* config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Shift-right
	and shift-left vector built-ins need to include a TRUNC_MOD_EXPR
	for correct semantics.  Also, don't expand a vector-splat if there
	is a type mismatch; let the back end handle it.

[gcc/testsuite]

2019-02-08  Bill Schmidt  <wschmidt@linux.ibm.com>

	* gcc.target/powerpc/srad-modulo.c: New.
	* gcc.target/powerpc/srd-modulo.c: New.


Index: gcc/config/rs6000/rs6000.c
===================================================================

--- gcc/config/rs6000/rs6000.c	(revision 268707)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -15735,13 +15735,37 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *
     case ALTIVEC_BUILTIN_VSRAH:
     case ALTIVEC_BUILTIN_VSRAW:
     case P8V_BUILTIN_VSRAD:
-      arg0 = gimple_call_arg (stmt, 0);
-      arg1 = gimple_call_arg (stmt, 1);
-      lhs = gimple_call_lhs (stmt);
-      g = gimple_build_assign (lhs, RSHIFT_EXPR, arg0, arg1);
-      gimple_set_location (g, gimple_location (stmt));
-      gsi_replace (gsi, g, true);
-      return true;
+      {
+	arg0 = gimple_call_arg (stmt, 0);
+	arg1 = gimple_call_arg (stmt, 1);
+	lhs = gimple_call_lhs (stmt);
+	tree arg1_type = TREE_TYPE (arg1);
+	tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1));
+	tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type));
+	location_t loc = gimple_location (stmt);
+	/* Force arg1 into the range valid matching the arg0 type.  */
+	/* Build a vector consisting of the max valid bit-size values.  */
+	int n_elts = VECTOR_CST_NELTS (arg1);
+	tree element_size = build_int_cst (unsigned_element_type,
+					   128 / n_elts);
+	tree_vector_builder elts (unsigned_arg1_type, n_elts, 1);
+	for (int i = 0; i < n_elts; i++)
+	  elts.safe_push (element_size);
+	tree modulo_tree = elts.build ();
+	/* Modulo the provided shift value against that vector.  */
+	gimple_seq stmts = NULL;
+	tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR,
+					   unsigned_arg1_type, arg1);
+	tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR,
+				      unsigned_arg1_type, unsigned_arg1,
+				      modulo_tree);
+	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+	/* And finally, do the shift.  */
+	g = gimple_build_assign (lhs, RSHIFT_EXPR, arg0, new_arg1);
+	gimple_set_location (g, loc);
+	gsi_replace (gsi, g, true);
+	return true;
+      }
    /* Flavors of vector shift left.
       builtin_altivec_vsl{b,h,w} -> vsl{b,h,w}.  */
     case ALTIVEC_BUILTIN_VSLB:
@@ -15795,14 +15819,34 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *
 	arg0 = gimple_call_arg (stmt, 0);
 	arg1 = gimple_call_arg (stmt, 1);
 	lhs = gimple_call_lhs (stmt);
+	tree arg1_type = TREE_TYPE (arg1);
+	tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1));
+	tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type));
+	location_t loc = gimple_location (stmt);
 	gimple_seq stmts = NULL;
 	/* Convert arg0 to unsigned.  */
 	tree arg0_unsigned
 	  = gimple_build (&stmts, VIEW_CONVERT_EXPR,
 			  unsigned_type_for (TREE_TYPE (arg0)), arg0);
+	/* Force arg1 into the range valid matching the arg0 type.  */
+	/* Build a vector consisting of the max valid bit-size values.  */
+	int n_elts = VECTOR_CST_NELTS (arg1);
+	tree element_size = build_int_cst (unsigned_element_type,
+					   128 / n_elts);
+	tree_vector_builder elts (unsigned_arg1_type, n_elts, 1);
+	for (int i = 0; i < n_elts; i++)
+	  elts.safe_push (element_size);
+	tree modulo_tree = elts.build ();
+	/* Modulo the provided shift value against that vector.  */
+	tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR,
+					   unsigned_arg1_type, arg1);
+	tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR,
+				      unsigned_arg1_type, unsigned_arg1,
+				      modulo_tree);
+	/* Do the shift.  */
 	tree res
 	  = gimple_build (&stmts, RSHIFT_EXPR,
-			  TREE_TYPE (arg0_unsigned), arg0_unsigned, arg1);
+			  TREE_TYPE (arg0_unsigned), arg0_unsigned, new_arg1);
 	/* Convert result back to the lhs type.  */
 	res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
@@ -16061,7 +16105,7 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *
     case ALTIVEC_BUILTIN_VSPLTISH:
     case ALTIVEC_BUILTIN_VSPLTISW:
       {
-	int size;
+	unsigned int size;
 	if (fn_code == ALTIVEC_BUILTIN_VSPLTISB)
 	  size = 8;
 	else if (fn_code == ALTIVEC_BUILTIN_VSPLTISH)
@@ -16072,6 +16116,13 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *
 	arg0 = gimple_call_arg (stmt, 0);
 	lhs = gimple_call_lhs (stmt);
 
+	/* It's sometimes useful to use one of these to build a
+	   splat for V2DImode, since the upper bits will be ignored.
+	   Don't fold if we detect that situation, as we end up
+	   losing the splat instruction in this case.  */
+	if (size != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (TREE_TYPE (lhs)))))
+	  return false;
+
 	/* Only fold the vec_splat_*() if the lower bits of arg 0 is a
 	   5-bit signed constant in range -16 to +15.  */
 	if (TREE_CODE (arg0) != INTEGER_CST
Index: gcc/testsuite/gcc.target/powerpc/srad-modulo.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/srad-modulo.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/srad-modulo.c	(working copy)
@@ -0,0 +1,43 @@
+/* Test that using a character splat to set up a shift-right algebraic
+   for a doubleword vector works correctly after gimple folding.  */
+
+/* { dg-do run { target { powerpc64*-*-* && vsx_hw } } } */
+/* { dg-options { "-O3" } } */
+
+#include <altivec.h>
+
+typedef __vector unsigned long long vui64_t;
+typedef __vector long long vi64_t;
+
+static inline vi64_t
+vec_sradi (vi64_t vra, const unsigned int shb)
+{
+  vui64_t rshift;
+  vi64_t result;
+
+  /* Note legitimate use of wrong-type splat due to expectation that only
+     lower 6-bits are read.  */
+  rshift = (vui64_t) vec_splat_s8((const unsigned char)shb);
+
+  /* Vector Shift Right Algebraic Doublewords based on the lower 6-bits
+     of corresponding element of rshift.  */
+  result = vec_vsrad (vra, rshift);
+
+  return (vi64_t) result;
+}
+
+vi64_t
+test_sradi_4 (vi64_t a)
+{
+  return vec_sradi (a, 4);
+}
+
+int
+main ()
+{
+  vi64_t x = {-256, 1025};
+  x = test_sradi_4 (x);
+  if (x[0] != -16 || x[1] != 64)
+    __builtin_abort ();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/powerpc/srd-modulo.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/srd-modulo.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/srd-modulo.c	(working copy)
@@ -0,0 +1,42 @@
+/* Test that using a character splat to set up a shift-right logical
+   for a doubleword vector works correctly after gimple folding.  */
+
+/* { dg-do run { target { powerpc64*-*-* && vsx_hw } } } */
+/* { dg-options { "-O3" } } */
+
+#include <altivec.h>
+
+typedef __vector unsigned long long vui64_t;
+
+static inline vui64_t
+vec_sradi (vui64_t vra, const unsigned int shb)
+{
+  vui64_t rshift;
+  vui64_t result;
+
+  /* Note legitimate use of wrong-type splat due to expectation that only
+     lower 6-bits are read.  */
+  rshift = (vui64_t) vec_splat_s8((const unsigned char)shb);
+
+  /* Vector Shift Right [Logical] Doublewords based on the lower 6-bits
+     of corresponding element of rshift.  */
+  result = vec_vsrd (vra, rshift);
+
+  return (vui64_t) result;
+}
+
+vui64_t
+test_sradi_4 (vui64_t a)
+{
+  return vec_sradi (a, 4);
+}
+
+int
+main ()
+{
+  vui64_t x = {1992357, 1025};
+  x = test_sradi_4 (x);
+  if (x[0] != 124522 || x[1] != 64)
+    __builtin_abort ();
+  return 0;
+}