[gcc r15-1240] Improve code generation of strided SLP loads
Richard Biener
rguenth@gcc.gnu.org
Thu Jun 13 06:22:55 GMT 2024
https://gcc.gnu.org/g:e8f4d525cb320ff11dd95b985d8043fef0510878
commit r15-1240-ge8f4d525cb320ff11dd95b985d8043fef0510878
Author: Richard Biener <rguenther@suse.de>
Date: Mon Jun 10 15:31:35 2024 +0200
Improve code generation of strided SLP loads
This avoids falling back to elementwise accesses for strided SLP
loads when the group size does not evenly divide the number of vector
elements.  Instead we can use a smaller vector or integer type for the
load.
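As an illustration (mine, not part of the commit message): the patch
picks the access width as the greatest common divisor of the group size
and the vector lane count, via gcd (group_size, const_nunits).  A
minimal standalone C sketch of just that computation, with hypothetical
names:

    /* Sketch only (my names; only the gcd idea is from the patch):
       the patch computes n = gcd (group_size, const_nunits) and uses
       n-element pieces; n == 1 degenerates to elementwise accesses.  */
    #include <stdio.h>

    static int
    gcd (int a, int b)
    {
      while (b != 0)
        {
          int t = a % b;
          a = b;
          b = t;
        }
      return a;
    }

    int
    main (void)
    {
      const int const_nunits = 4;   /* Lanes in a V4SI vector.  */
      for (int group_size = 2; group_size <= 8; ++group_size)
        {
          int n = gcd (group_size, const_nunits);
          printf ("group size %d: piece = %d elements, pieces per vector = %d\n",
                  group_size, n, const_nunits / n);
        }
      return 0;
    }

For a six-element group this yields two-element (V2SI / 64-bit) pieces
rather than six scalar accesses.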
For stores we can do the same, though the restrictions on the stores we
handle and the fact that store merging covers things up make this mostly
effective for cost modeling.  That shows in
gcc.target/i386/vect-strided-3.c, which we now vectorize with V4SI
vectors rather than just V2SI ones: with a six-element store group and
four-lane V4SI the common access width is gcd (6, 4) == 2 elements, so
the loop ends up issuing six 64-bit stores per vector iteration.
For all of this there's still the opportunity to use non-uniform
accesses: for a six-element group with a VF of two one could, say, do
V4SI, { V2SI, V2SI }, V4SI (sketched below).  But that's for a possible
followup.
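To make the followup idea concrete, here is a hand-written SSE2 sketch
(my illustration, not code the vectorizer emits) of that non-uniform
pattern for one vector iteration covering two six-element groups, where
b and b + s are the group starts:

    #include <emmintrin.h>

    /* Illustration only: build three V4SI vectors from two six-element
       groups using four accesses (V4SI, { V2SI, V2SI }, V4SI) instead
       of six uniform V2SI loads.  */
    static void
    load_two_groups (const int *b, long s, __m128i out[3])
    {
      __m128i g0lo = _mm_loadu_si128 ((const __m128i *) b);           /* g0[0..3], V4SI */
      __m128i g0hi = _mm_loadl_epi64 ((const __m128i *) (b + 4));     /* g0[4..5], V2SI */
      __m128i g1lo = _mm_loadl_epi64 ((const __m128i *) (b + s));     /* g1[0..1], V2SI */
      __m128i g1hi = _mm_loadu_si128 ((const __m128i *) (b + s + 2)); /* g1[2..5], V4SI */
      out[0] = g0lo;
      out[1] = _mm_unpacklo_epi64 (g0hi, g1lo);                       /* g0[4..5] | g1[0..1] */
      out[2] = g1hi;
    }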
        * tree-vect-stmts.cc (get_group_load_store_type): Consistently
        use VMAT_STRIDED_SLP for strided SLP accesses and not
        VMAT_ELEMENTWISE.
        (vectorizable_store): Adjust VMAT_STRIDED_SLP handling to
        allow not only half-size but also smaller accesses.
        (vectorizable_load): Likewise.

        * gcc.target/i386/vect-strided-1.c: New testcase.
        * gcc.target/i386/vect-strided-2.c: Likewise.
        * gcc.target/i386/vect-strided-3.c: Likewise.
        * gcc.target/i386/vect-strided-4.c: Likewise.
Diff:
---
gcc/testsuite/gcc.target/i386/vect-strided-1.c | 24 ++++++
gcc/testsuite/gcc.target/i386/vect-strided-2.c | 17 +++++
gcc/testsuite/gcc.target/i386/vect-strided-3.c | 20 +++++
gcc/testsuite/gcc.target/i386/vect-strided-4.c | 20 +++++
gcc/tree-vect-stmts.cc | 100 ++++++++++++-------------
5 files changed, 127 insertions(+), 54 deletions(-)
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-1.c b/gcc/testsuite/gcc.target/i386/vect-strided-1.c
new file mode 100644
index 000000000000..db4a06711f11
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-1.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[8*i+0] = b[s*i+0];
+      a[8*i+1] = b[s*i+1];
+      a[8*i+2] = b[s*i+2];
+      a[8*i+3] = b[s*i+3];
+      a[8*i+4] = b[s*i+4];
+      a[8*i+5] = b[s*i+5];
+      a[8*i+6] = b[s*i+4];
+      a[8*i+7] = b[s*i+5];
+    }
+}
+
+/* Three two-element loads, two four-element stores.  On ia32 we elide
+   a permute and perform a redundant load.  */
+/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "movhps" 2 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movhps" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movups" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-2.c b/gcc/testsuite/gcc.target/i386/vect-strided-2.c
new file mode 100644
index 000000000000..6fd64e28cf0d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[4*i+0] = b[s*i+0];
+      a[4*i+1] = b[s*i+1];
+      a[4*i+2] = b[s*i+0];
+      a[4*i+3] = b[s*i+1];
+    }
+}
+
+/* One two-element load, one four-element store. */
+/* { dg-final { scan-assembler-times "movq" 1 } } */
+/* { dg-final { scan-assembler-times "movups" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-3.c b/gcc/testsuite/gcc.target/i386/vect-strided-3.c
new file mode 100644
index 000000000000..b462701a0b2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-3.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx -fno-tree-slp-vectorize" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+  if (s >= 6)
+    for (int i = 0; i < 1024; ++i)
+      {
+        a[s*i+0] = b[4*i+0];
+        a[s*i+1] = b[4*i+1];
+        a[s*i+2] = b[4*i+2];
+        a[s*i+3] = b[4*i+3];
+        a[s*i+4] = b[4*i+0];
+        a[s*i+5] = b[4*i+1];
+      }
+}
+
+/* The vectorizer generates 6 uint64 stores.  */
+/* { dg-final { scan-assembler-times "movq" 4 } } */
+/* { dg-final { scan-assembler-times "movhps" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
new file mode 100644
index 000000000000..dd922926a2a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.2 -mno-avx -fno-tree-slp-vectorize" } */
+
+void foo (int * __restrict a, int * __restrict b, int *c, int s)
+{
+  if (s >= 2)
+    for (int i = 0; i < 1024; ++i)
+      {
+        a[s*i+0] = c[4*i+0];
+        a[s*i+1] = c[4*i+1];
+        b[s*i+0] = c[4*i+2];
+        b[s*i+1] = c[4*i+3];
+      }
+}
+
+/* Vectorization factor two, two two-element stores to a using movq
+   and two two-element stores to b via pextrq/movhps of the high part.  */
+/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 05fe523da993..e32d44050e53 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2036,15 +2036,10 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
             first_dr_info
               = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
           if (STMT_VINFO_STRIDED_P (first_stmt_info))
-            {
-              /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
-                 separated by the stride, until we have a complete vector.
-                 Fall back to scalar accesses if that isn't possible.  */
-              if (multiple_p (nunits, group_size))
-                *memory_access_type = VMAT_STRIDED_SLP;
-              else
-                *memory_access_type = VMAT_ELEMENTWISE;
-            }
+            /* Try to use consecutive accesses of as many elements as possible,
+               separated by the stride, until we have a complete vector.
+               Fall back to scalar accesses if that isn't possible.  */
+            *memory_access_type = VMAT_STRIDED_SLP;
           else
             {
               int cmp = compare_step_with_zero (vinfo, stmt_info);
@@ -8506,12 +8501,29 @@ vectorizable_store (vec_info *vinfo,
           tree lvectype = vectype;
           if (slp)
             {
-              if (group_size < const_nunits
-                  && const_nunits % group_size == 0)
+              HOST_WIDE_INT n = gcd (group_size, const_nunits);
+              if (n == const_nunits)
                 {
-                  nstores = const_nunits / group_size;
-                  lnel = group_size;
-                  ltype = build_vector_type (elem_type, group_size);
+                  int mis_align = dr_misalignment (first_dr_info, vectype);
+                  dr_alignment_support dr_align
+                    = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
+                                                     mis_align);
+                  if (dr_align == dr_aligned
+                      || dr_align == dr_unaligned_supported)
+                    {
+                      nstores = 1;
+                      lnel = const_nunits;
+                      ltype = vectype;
+                      lvectype = vectype;
+                      alignment_support_scheme = dr_align;
+                      misalignment = mis_align;
+                    }
+                }
+              else if (n > 1)
+                {
+                  nstores = const_nunits / n;
+                  lnel = n;
+                  ltype = build_vector_type (elem_type, n);
                   lvectype = vectype;
                   /* First check if vec_extract optab doesn't support extraction
@@ -8520,7 +8532,7 @@ vectorizable_store (vec_info *vinfo,
                   machine_mode vmode;
                   if (!VECTOR_MODE_P (TYPE_MODE (vectype))
                       || !related_vector_mode (TYPE_MODE (vectype), elmode,
-                                               group_size).exists (&vmode)
+                                               n).exists (&vmode)
                       || (convert_optab_handler (vec_extract_optab,
                                                  TYPE_MODE (vectype), vmode)
                           == CODE_FOR_nothing))
@@ -8531,8 +8543,8 @@ vectorizable_store (vec_info *vinfo,
                          re-interpreting it as the original vector type if
                          supported.  */
                       unsigned lsize
-                        = group_size * GET_MODE_BITSIZE (elmode);
-                      unsigned int lnunits = const_nunits / group_size;
+                        = n * GET_MODE_BITSIZE (elmode);
+                      unsigned int lnunits = const_nunits / n;
                       /* If we can't construct such a vector fall back to
                          element extracts from the original vector type and
                          element size stores.  */
@@ -8545,7 +8557,7 @@ vectorizable_store (vec_info *vinfo,
                               != CODE_FOR_nothing))
                         {
                           nstores = lnunits;
-                          lnel = group_size;
+                          lnel = n;
                           ltype = build_nonstandard_integer_type (lsize, 1);
                           lvectype = build_vector_type (ltype, nstores);
                         }
@@ -8556,24 +8568,6 @@ vectorizable_store (vec_info *vinfo,
                          issue exists here for reasonable archs.  */
                     }
                 }
-              else if (group_size >= const_nunits
-                       && group_size % const_nunits == 0)
-                {
-                  int mis_align = dr_misalignment (first_dr_info, vectype);
-                  dr_alignment_support dr_align
-                    = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
-                                                     mis_align);
-                  if (dr_align == dr_aligned
-                      || dr_align == dr_unaligned_supported)
-                    {
-                      nstores = 1;
-                      lnel = const_nunits;
-                      ltype = vectype;
-                      lvectype = vectype;
-                      alignment_support_scheme = dr_align;
-                      misalignment = mis_align;
-                    }
-                }
               ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
               ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
             }
@@ -10353,34 +10347,32 @@ vectorizable_load (vec_info *vinfo,
           auto_vec<tree> dr_chain;
           if (memory_access_type == VMAT_STRIDED_SLP)
             {
-              if (group_size < const_nunits)
+              HOST_WIDE_INT n = gcd (group_size, const_nunits);
+              /* Use the target vector type if the group size is a multiple
+                 of it.  */
+              if (n == const_nunits)
+                {
+                  nloads = 1;
+                  lnel = const_nunits;
+                  ltype = vectype;
+                }
+              /* Else use the biggest vector with which we can load the group
+                 without accessing excess elements.  */
+              else if (n > 1)
                 {
-                  /* First check if vec_init optab supports construction from vector
-                     elts directly.  Otherwise avoid emitting a constructor of
-                     vector elements by performing the loads using an integer type
-                     of the same size, constructing a vector of those and then
-                     re-interpreting it as the original vector type.  This avoids a
-                     huge runtime penalty due to the general inability to perform
-                     store forwarding from smaller stores to a larger load.  */
                   tree ptype;
                   tree vtype
-                    = vector_vector_composition_type (vectype,
-                                                      const_nunits / group_size,
+                    = vector_vector_composition_type (vectype, const_nunits / n,
                                                       &ptype);
                   if (vtype != NULL_TREE)
                     {
-                      nloads = const_nunits / group_size;
-                      lnel = group_size;
+                      nloads = const_nunits / n;
+                      lnel = n;
                       lvectype = vtype;
                       ltype = ptype;
                     }
                 }
-              else
-                {
-                  nloads = 1;
-                  lnel = const_nunits;
-                  ltype = vectype;
-                }
+              /* Else fall back to the default element-wise access.  */
               ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
             }
           /* Load vector(1) scalar_type if it's 1 element-wise vectype.  */