[PATCH, RFC] First cut at using vec_construct for strided loads

Wed Jun 8 12:30:00 GMT 2016

On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> This patch is a follow-up to the discussion generated by
> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html.  I've added
> vec_construct to the cost model for use in vect_model_load_cost, and
> implemented a cost calculation that makes sense to me for PowerPC.  I'm
> less certain about the default, i386, and spu implementations.  I took a
> guess at i386 from the discussions we had, and used the same calculation
> for the default and for spu.  I'm hoping you or others can fill in the
> blanks if I guessed badly.
>
> The i386 cost for vec_construct is different from all the others, which
> are parameterized for each processor description.  This should probably
> be parameterized in some way as well, but thought you'd know better than
> I how that should be.  Perhaps instead of
>
>         elements / 2 + 1
>
> it should be
>
>         (elements / 2) * X + Y
>
> where X and Y are taken from the processor description, and represent
> the cost of a merge and a permute, respectively.  Let me know what you
> think.

I'm trying to understand how you arrived at the above formulas, while
investigating a strangely low cost of 9 (16 / 2 + 1) for v16qi construction.
If we pairwise reduce elements with a cost of 1 each, then we arrive at a cost
of elements - 1.  That is also what you'd get by not accounting for the initial
move of element zero into a vector and then inserting each of the other
elements into it with elements - 1 inserts.

This also matches up with code-generation on x86_64 for

vT foo (T a, T b, ...)
{
  return (vT) {a, b, ... };
}

for any vector / element type combination I tried.  Thus the patch below.

I'll bootstrap / test that on x86_64-linux and I'm leaving other
targets to the target maintainers.

Ok for the i386 parts?

Thanks,
Richard.

2016-06-08  Richard Biener  <rguenther@suse.de>

        * targhooks.c (default_builtin_vectorization_cost): Adjust
        vec_construct cost.
        * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.

Index: gcc/targhooks.c
===================================================================

--- gcc/targhooks.c     (revision 237196)
+++ gcc/targhooks.c     (working copy)
@@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum
         return 3;

       case vec_construct:
-       elements = TYPE_VECTOR_SUBPARTS (vectype);
-       return elements / 2 + 1;
+       return TYPE_VECTOR_SUBPARTS (vectype) - 1;

       default:
         gcc_unreachable ();
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c      (revision 237196)
+++ gcc/config/i386/i386.c      (working copy)
@@ -49503,8 +49520,6 @@ static int
 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                  tree vectype, int)
 {
-  unsigned elements;
-
   switch (type_of_cost)
     {
       case scalar_stmt:
@@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve
         return ix86_cost->vec_stmt_cost;

       case vec_construct:
-       elements = TYPE_VECTOR_SUBPARTS (vectype);
-       return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
+       return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);

       default:
         gcc_unreachable ();


> Thanks,
> Bill
>
>
> 2012-06-12  Bill Schmidt  <wschmidt@linux.ibm.com>
>
>         * targhooks.c (default_builtin_vectorization_cost): Handle
>         vec_construct, using vectype to base cost on subparts.
>         * target.h (enum vect_cost_for_stmt): Add vec_construct.
>         * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
>         instead of scalar_to_vec.
>         * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
>         vec_construct in same way as default for now.
>         * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>         * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
>         Handle vec_construct, including special case for 32-bit loads.
>
>
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c     (revision 188482)
> +++ gcc/targhooks.c     (working copy)
> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
>
>  int
>  default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> -                                    tree vectype ATTRIBUTE_UNUSED,
> +                                    tree vectype,
>                                      int misalign ATTRIBUTE_UNUSED)
>  {
> +  unsigned elements;
> +
>    switch (type_of_cost)
>      {
>        case scalar_stmt:
> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
>        case cond_branch_taken:
>          return 3;
>
> +      case vec_construct:
> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> +       gcc_assert (elements > 1);
> +       return elements / 2 + 1;
> +
>        default:
>          gcc_unreachable ();
>      }
> Index: gcc/target.h
> ===================================================================
> --- gcc/target.h        (revision 188482)
> +++ gcc/target.h        (working copy)
> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
>    cond_branch_not_taken,
>    cond_branch_taken,
>    vec_perm,
> -  vec_promote_demote
> +  vec_promote_demote,
> +  vec_construct
>  };
>
>  /* The target structure.  This holds all the backend hooks.  */
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c       (revision 188482)
> +++ gcc/tree-vect-stmts.c       (working copy)
> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
>    /* The loads themselves.  */
>    if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
>      {
> -      /* N scalar loads plus gathering them into a vector.
> -         ???  scalar_to_vec isn't the cost for that.  */
> +      /* N scalar loads plus gathering them into a vector.  */
> +      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>        inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
> -                     * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
> -      inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
> +                     * TYPE_VECTOR_SUBPARTS (vectype));
> +      inside_cost += ncopies
> +       * targetm.vectorize.builtin_vectorization_cost (vec_construct,
> +                                                       vectype, 0);
>      }
>    else
>      vect_get_load_cost (first_dr, ncopies,
> Index: gcc/config/spu/spu.c
> ===================================================================
> --- gcc/config/spu/spu.c        (revision 188482)
> +++ gcc/config/spu/spu.c        (working copy)
> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
>  /* Implement targetm.vectorize.builtin_vectorization_cost.  */
>  static int
>  spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> -                                tree vectype ATTRIBUTE_UNUSED,
> +                                tree vectype,
>                                  int misalign ATTRIBUTE_UNUSED)
>  {
> +  unsigned elements;
> +
>    switch (type_of_cost)
>      {
>        case scalar_stmt:
> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
>        case cond_branch_taken:
>          return 6;
>
> +      case vec_construct:
> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> +       gcc_assert (elements > 1);
> +       return elements / 2 + 1;
> +
>        default:
>          gcc_unreachable ();
>      }
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c      (revision 188482)
> +++ gcc/config/i386/i386.c      (working copy)
> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
>  /* Implement targetm.vectorize.builtin_vectorization_cost.  */
>  static int
>  ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> -                                 tree vectype ATTRIBUTE_UNUSED,
> +                                 tree vectype,
>                                   int misalign ATTRIBUTE_UNUSED)
>  {
> +  unsigned elements;
> +
>    switch (type_of_cost)
>      {
>        case scalar_stmt:
> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
>        case vec_promote_demote:
>          return ix86_cost->vec_stmt_cost;
>
> +      case vec_construct:
> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> +       gcc_assert (elements > 1);
> +       return elements / 2 + 1;
> +
>        default:
>          gcc_unreachable ();
>      }
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c  (revision 188482)
> +++ gcc/config/rs6000/rs6000.c  (working copy)
> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>                                     tree vectype, int misalign)
>  {
>    unsigned elements;
> +  tree elem_type;
>
>    switch (type_of_cost)
>      {
> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>
>          return 2;
>
> +      case vec_construct:
> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> +       elem_type = TREE_TYPE (vectype);
> +       gcc_assert (elements > 1);
> +       /* 32-bit floats loaded into registers are stored as double
> +          precision, so we need n/2 converts in addition to the usual
> +          n/2 merges to construct a vector of short floats from them.  */
> +       if (SCALAR_FLOAT_TYPE_P (elem_type)
> +           && TYPE_PRECISION (elem_type) == 32)
> +         return elements + 1;
> +       else
> +         return elements / 2 + 1;
> +
>        default:
>          gcc_unreachable ();
>      }
>
>