This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH, RFC] First cut at using vec_construct for strided loads
- From: "William J. Schmidt" <wschmidt at linux dot vnet dot ibm dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: rguenther at suse dot de, bergner at vnet dot ibm dot com
- Date: Tue, 12 Jun 2012 21:18:56 -0500
- Subject: [PATCH, RFC] First cut at using vec_construct for strided loads
This patch is a follow-up to the discussion generated by
http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added
vec_construct to the cost model for use in vect_model_load_cost, and
implemented a cost calculation that makes sense to me for PowerPC. I'm
less certain about the default, i386, and spu implementations. I took a
guess at i386 from the discussions we had, and used the same calculation
for the default and for spu. I'm hoping you or others can fill in the
blanks if I guessed badly.
The i386 cost for vec_construct differs from the other i386 costs, which
are parameterized for each processor description. This one should probably
be parameterized in some way as well, but I thought you'd know better than
I how that should be done. Perhaps instead of
elements / 2 + 1
it should be
(elements / 2) * X + Y
where X and Y are taken from the processor description, and represent
the cost of a merge and a permute, respectively. Let me know what you
think.
Thanks,
Bill
2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com>
* targhooks.c (default_builtin_vectorization_cost): Handle
vec_construct, using vectype to base cost on subparts.
* target.h (enum vect_cost_for_stmt): Add vec_construct.
* tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
instead of scalar_to_vec.
* config/spu/spu.c (spu_builtin_vectorization_cost): Handle
vec_construct in same way as default for now.
* config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
* config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
Handle vec_construct, including special case for 32-bit loads.
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c (revision 188482)
+++ gcc/targhooks.c (working copy)
@@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
int
default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype ATTRIBUTE_UNUSED,
+ tree vectype,
int misalign ATTRIBUTE_UNUSED)
{
+ unsigned elements;
+
switch (type_of_cost)
{
case scalar_stmt:
@@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
case cond_branch_taken:
return 3;
+ case vec_construct:
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ gcc_assert (elements > 1);
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}
Index: gcc/target.h
===================================================================
--- gcc/target.h (revision 188482)
+++ gcc/target.h (working copy)
@@ -146,7 +146,8 @@ enum vect_cost_for_stmt
cond_branch_not_taken,
cond_branch_taken,
vec_perm,
- vec_promote_demote
+ vec_promote_demote,
+ vec_construct
};
/* The target structure. This holds all the backend hooks. */
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c (revision 188482)
+++ gcc/tree-vect-stmts.c (working copy)
@@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
/* The loads themselves. */
if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
{
- /* N scalar loads plus gathering them into a vector.
- ??? scalar_to_vec isn't the cost for that. */
+ /* N scalar loads plus gathering them into a vector. */
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
- * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
- inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
+ * TYPE_VECTOR_SUBPARTS (vectype));
+ inside_cost += ncopies
+ * targetm.vectorize.builtin_vectorization_cost (vec_construct,
+ vectype, 0);
}
else
vect_get_load_cost (first_dr, ncopies,
Index: gcc/config/spu/spu.c
===================================================================
--- gcc/config/spu/spu.c (revision 188482)
+++ gcc/config/spu/spu.c (working copy)
@@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype ATTRIBUTE_UNUSED,
+ tree vectype,
int misalign ATTRIBUTE_UNUSED)
{
+ unsigned elements;
+
switch (type_of_cost)
{
case scalar_stmt:
@@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
case cond_branch_taken:
return 6;
+ case vec_construct:
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ gcc_assert (elements > 1);
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c (revision 188482)
+++ gcc/config/i386/i386.c (working copy)
@@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype ATTRIBUTE_UNUSED,
+ tree vectype,
int misalign ATTRIBUTE_UNUSED)
{
+ unsigned elements;
+
switch (type_of_cost)
{
case scalar_stmt:
@@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
case vec_promote_demote:
return ix86_cost->vec_stmt_cost;
+ case vec_construct:
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ gcc_assert (elements > 1);
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c (revision 188482)
+++ gcc/config/rs6000/rs6000.c (working copy)
@@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
tree vectype, int misalign)
{
unsigned elements;
+ tree elem_type;
switch (type_of_cost)
{
@@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
return 2;
+ case vec_construct:
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ elem_type = TREE_TYPE (vectype);
+ gcc_assert (elements > 1);
+ /* 32-bit floats loaded into registers are stored as double
+ precision, so we need n/2 converts in addition to the usual
+ n/2 merges to construct a vector of single-precision floats from them. */
+ if (SCALAR_FLOAT_TYPE_P (elem_type)
+ && TYPE_PRECISION (elem_type) == 32)
+ return elements + 1;
+ else
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}