From 2a530703c3b7b893c83a13cc8e9e211bddb428b0 Mon Sep 17 00:00:00 2001
From: Joel Hutton <joel.hutton@arm.com>
Date: Fri, 5 Feb 2021 12:52:12 +0000
Subject: [PATCH] [aarch64][vect] Support V8QI->V8HI WIDEN_ patterns

In the case where 8 out of every 16 elements are widened using a
widening pattern and the next 8 are skipped the patterns are not
recognized. This is because they are normally used in a pair, such  as
VEC_WIDEN_MINUS_HI/LO, to achieve a v16qi->v16hi conversion for example.
This patch adds support for V8QI->V8HI patterns.

gcc/ChangeLog:

    PR tree-optimisation/98772
    * optabs-tree.c (supportable_half_widening_operation): New function
    to check for supportable V8QI->V8HI widening patterns.
    * optabs-tree.h (supportable_half_widening_operation): New function.
    * tree-vect-stmts.c (vect_create_vectorized_promotion_stmts): New
    function to create promotion stmts for V8QI->V8HI widening patterns.
    (vectorizable_conversion): Add case for V8QI->V8HI.

gcc/testsuite/ChangeLog:

    PR tree-optimisation/98772
    * gcc.target/aarch64/pr98772.c: New test.
---
 gcc/optabs-tree.c                          |  75 ++++++++++
 gcc/optabs-tree.h                          |   3 +
 gcc/testsuite/gcc.target/aarch64/pr98772.c | 155 +++++++++++++++++++++
 gcc/tree-vect-stmts.c                      |  93 +++++++++++--
 4 files changed, 318 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr98772.c

diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
index c94073e3ed9..6cd12a807f9 100644
--- a/gcc/optabs-tree.c
+++ b/gcc/optabs-tree.c
@@ -277,6 +277,81 @@ optab_for_tree_code (enum tree_code code, const_tree type,
     }
 }
 
+/* Function supportable_half_widening_operation
+
+   Check whether an operation represented by code is a 'half' widening operation
+   using only half of the full vector width for the inputs. This allows the
+   output to be stored in a single vector, as opposed to using the full width
+   for input, where half the elements must be processed at a time into
+   respective full-width vector outputs i.e. a 'hi'/'lo' pair using codes such
+   as VEC_WIDEN_MINUS_HI/LO. This is handled by widening the inputs and using
+   non-widening instructions. RTL fusing converts these to the widening hardware
+   instructions if supported.
+
+   Supported widening operations:
+    WIDEN_MINUS_EXPR
+    WIDEN_PLUS_EXPR
+    WIDEN_MULT_EXPR
+    WIDEN_LSHIFT_EXPR
+
+   Output:
+   - CODE1 - The non-widened code, which will be used after the inputs are
+     converted to the wide type.
+   */
+bool
+supportable_half_widening_operation (enum tree_code code,
+			       tree vectype_out, tree vectype_in,
+			       enum tree_code *code1)
+{
+  machine_mode m1,m2;
+  bool truncp;
+  enum tree_code dummy_code;
+  optab op;
+
+  gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
+
+  m1 = TYPE_MODE (vectype_out);
+  m2 = TYPE_MODE (vectype_in);
+
+  if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
+    return false;
+
+  if(!known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
+		  TYPE_VECTOR_SUBPARTS (vectype_out)))
+      return false;
+
+  if(!(code == WIDEN_MINUS_EXPR
+       || code == WIDEN_PLUS_EXPR
+       || code == WIDEN_MULT_EXPR
+       || code == WIDEN_LSHIFT_EXPR))
+    return false;
+
+  switch (code)
+  {
+    case WIDEN_LSHIFT_EXPR:
+      *code1 = LSHIFT_EXPR;
+      break;
+    case WIDEN_MINUS_EXPR:
+      *code1 = MINUS_EXPR;
+      break;
+    case WIDEN_PLUS_EXPR:
+      *code1 = PLUS_EXPR;
+      break;
+    case WIDEN_MULT_EXPR:
+      *code1 = MULT_EXPR;
+      break;
+    default:
+      gcc_unreachable ();
+  }
+
+  if (!supportable_convert_operation(NOP_EXPR, vectype_out, vectype_in,
+				     &dummy_code))
+    return false;
+
+  op = optab_for_tree_code (*code1, vectype_out, optab_vector);
+  return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
+}
+
 /* Function supportable_convert_operation
 
    Check whether an operation represented by the code CODE is a
diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
index 173da0d3bd2..c3aaa1a4169 100644
--- a/gcc/optabs-tree.h
+++ b/gcc/optabs-tree.h
@@ -36,6 +36,9 @@ enum optab_subtype
    the second argument.  The third argument distinguishes between the types of
    vector shifts and rotates.  */
 optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype);
+bool
+supportable_half_widening_operation (enum tree_code, tree, tree,
+				    enum tree_code *);
 bool supportable_convert_operation (enum tree_code, tree, tree,
 				    enum tree_code *);
 bool expand_vec_cmp_expr_p (tree, tree, enum tree_code);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr98772.c b/gcc/testsuite/gcc.target/aarch64/pr98772.c
new file mode 100644
index 00000000000..35568a9f9d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr98772.c
@@ -0,0 +1,155 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps" } */
+#include <stdint.h>
+#include <string.h>
+
+#define DSIZE 16
+#define PIXSIZE 64
+
+extern void
+wplus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] + pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wplus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] + pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wminus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] - pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wminus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] - pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wmult (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] * pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wmult_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] * pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wlshift (uint16_t *d, uint8_t *restrict pix1)
+
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] << 8;
+	pix1 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wlshift_no_opt (uint16_t *d, uint8_t *restrict pix1)
+
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] << 8;
+	pix1 += 16;
+    }
+}
+
+void __attribute__((optimize (0)))
+init_arrays(uint16_t *d_a, uint16_t *d_b, uint8_t *pix1, uint8_t *pix2)
+{
+  for(int i = 0; i < DSIZE; i++)
+  {
+    d_a[i] = (1074 * i)%17;
+    d_b[i] = (1074 * i)%17;
+  }
+  for(int i = 0; i < PIXSIZE; i++)
+  {
+    pix1[i] = (1024 * i)%17;
+    pix2[i] = (1024 * i)%17;
+  }
+}
+
+/* Don't optimize main so we don't get confused over where the vector
+   instructions are generated. */
+__attribute__((optimize (0)))
+int main()
+{
+  uint16_t d_a[DSIZE];
+  uint16_t d_b[DSIZE];
+  uint8_t pix1[PIXSIZE];
+  uint8_t pix2[PIXSIZE];
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wplus(d_a, pix1, pix2);
+  wplus_no_opt(d_b, pix1, pix2);
+  if (memcmp(d_a,d_b, DSIZE) != 0)
+    return 1;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wminus(d_a, pix1, pix2);
+  wminus_no_opt(d_b, pix1, pix2);
+  if (memcmp(d_a,d_b, DSIZE) != 0)
+    return 2;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wmult(d_a, pix1, pix2);
+  wmult_no_opt(d_b, pix1, pix2);
+  if (memcmp(d_a,d_b, DSIZE) != 0)
+    return 3;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wlshift(d_a, pix1);
+  wlshift_no_opt(d_b, pix1);
+  if (memcmp(d_a,d_b, DSIZE) != 0)
+    return 4;
+
+}
+
+/* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "umull\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "shl\\tv" 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f180ced3124..3cfb27f4adc 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -4545,6 +4545,64 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
   *vec_oprnds0 = vec_tmp;
 }
 
+/* Create vectorized promotion stmts for widening stmts using only half the
+   potential vector size for input.  */
+static void
+vect_create_vectorized_promotion_stmts (vec_info *vinfo,
+					vec<tree> *vec_oprnds0,
+					vec<tree> *vec_oprnds1,
+					stmt_vec_info stmt_info, tree vec_dest,
+					gimple_stmt_iterator *gsi,
+					enum tree_code code1,
+					int op_type)
+{
+  int i;
+  tree vop0, vop1;
+  gimple *new_stmt1;
+  gimple *new_stmt2;
+  gimple *new_stmt3;
+  vec<tree> vec_tmp = vNULL;
+
+  vec_tmp.create (vec_oprnds0->length ());
+  FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
+    {
+      tree new_tmp1, new_tmp2, new_tmp3, out_type;
+
+      gcc_assert (op_type == binary_op);
+      vop1 = (*vec_oprnds1)[i];
+
+      /* Widen the first vector input.  */
+      out_type = TREE_TYPE (vec_dest);
+      new_tmp1 = make_ssa_name (out_type);
+      new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
+      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
+      if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
+	{
+	  /* Widen the second vector input.  */
+	  new_tmp2 = make_ssa_name (out_type);
+	  new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
+	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
+	  /* Perform the operation.  With both vector inputs widened.  */
+	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, new_tmp2);
+	}
+      else
+	{
+	  /* Perform the operation.  With the single vector input widened.  */
+	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, vop1);
+      }
+
+      new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
+      gimple_assign_set_lhs (new_stmt3, new_tmp3);
+      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
+
+      /* Store the results for the next step.  */
+      vec_tmp.quick_push (new_tmp3);
+    }
+
+  vec_oprnds0->release ();
+  *vec_oprnds0 = vec_tmp;
+}
+
 
 /* Check if STMT_INFO performs a conversion operation that can be vectorized.
    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
@@ -4697,7 +4755,13 @@ vectorizable_conversion (vec_info *vinfo,
   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
   if (known_eq (nunits_out, nunits_in))
-    modifier = NONE;
+    if (code == WIDEN_MINUS_EXPR
+	|| code == WIDEN_PLUS_EXPR
+	|| code == WIDEN_LSHIFT_EXPR
+	|| code == WIDEN_MULT_EXPR)
+	modifier = WIDEN;
+    else
+      modifier = NONE;
   else if (multiple_p (nunits_out, nunits_in))
     modifier = NARROW;
   else
@@ -4743,9 +4807,16 @@ vectorizable_conversion (vec_info *vinfo,
       return false;
 
     case WIDEN:
-      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
-					  vectype_in, &code1, &code2,
-					  &multi_step_cvt, &interm_types))
+      if (supportable_half_widening_operation (code, vectype_out, vectype_in,
+					    &code1))
+	{
+	  gcc_assert (!(multi_step_cvt && op_type == binary_op));
+	  break;
+	}
+      else if (supportable_widening_operation (vinfo, code, stmt_info,
+					       vectype_out, vectype_in, &code1,
+					       &code2, &multi_step_cvt,
+					       &interm_types))
 	{
 	  /* Binary widening operation can only be supported directly by the
 	     architecture.  */
@@ -4981,10 +5052,16 @@ vectorizable_conversion (vec_info *vinfo,
 	      c1 = codecvt1;
 	      c2 = codecvt2;
 	    }
-	  vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
-						  &vec_oprnds1, stmt_info,
-						  this_dest, gsi,
-						  c1, c2, op_type);
+	  if (known_eq(nunits_out, nunits_in))
+	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
+						    &vec_oprnds1, stmt_info,
+						    this_dest, gsi,
+						    c1, op_type);
+	  else
+	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
+						    &vec_oprnds1, stmt_info,
+						    this_dest, gsi,
+						    c1, c2, op_type);
 	}
 
       FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
-- 
2.17.1