[gcc r15-1239] tree-optimization/115385 - handle more gaps with peeling of a single iteration
Richard Biener
rguenth@gcc.gnu.org
Thu Jun 13 06:22:49 GMT 2024
https://gcc.gnu.org/g:6669dc51515313dd1e60c493596dbc90429fc362
commit r15-1239-g6669dc51515313dd1e60c493596dbc90429fc362
Author: Richard Biener <rguenther@suse.de>
Date: Fri Jun 7 14:47:12 2024 +0200
tree-optimization/115385 - handle more gaps with peeling of a single iteration
The following makes peeling of a single scalar iteration handle more
gaps, including non-power-of-two cases. This can be done by rounding
up the remaining access to the next power-of-two which ensures that
the next scalar iteration will pick at least the number of excess
elements we access.
I've added a correctness testcase and one x86 specific scanning for
the optimization.
PR tree-optimization/115385
* tree-vect-stmts.cc (get_group_load_store_type): Peeling
of a single scalar iteration is sufficient if we can narrow
the access to the next power of two of the bits in the last
access.
(vectorizable_load): Ensure that the last access is narrowed.
* gcc.dg/vect/pr115385.c: New testcase.
* gcc.target/i386/vect-pr115385.c: Likewise.
Diff:
---
gcc/testsuite/gcc.dg/vect/pr115385.c | 88 +++++++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/vect-pr115385.c | 53 ++++++++++++++++
gcc/tree-vect-stmts.cc | 44 ++++++++++++--
3 files changed, 180 insertions(+), 5 deletions(-)
diff --git a/gcc/testsuite/gcc.dg/vect/pr115385.c b/gcc/testsuite/gcc.dg/vect/pr115385.c
new file mode 100644
index 000000000000..a18cd665d7d0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115385.c
@@ -0,0 +1,88 @@
+/* { dg-require-effective-target mmap } */
+
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define COUNT 511
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+#define TYPE unsigned char
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+void __attribute__((noipa)) foo(TYPE * __restrict x,
+ TYPE *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[16*i+0] = y[3*i+0];
+ x[16*i+1] = y[3*i+1];
+ x[16*i+2] = y[3*i+2];
+ x[16*i+3] = y[3*i+0];
+ x[16*i+4] = y[3*i+1];
+ x[16*i+5] = y[3*i+2];
+ x[16*i+6] = y[3*i+0];
+ x[16*i+7] = y[3*i+1];
+ x[16*i+8] = y[3*i+2];
+ x[16*i+9] = y[3*i+0];
+ x[16*i+10] = y[3*i+1];
+ x[16*i+11] = y[3*i+2];
+ x[16*i+12] = y[3*i+0];
+ x[16*i+13] = y[3*i+1];
+ x[16*i+14] = y[3*i+2];
+ x[16*i+15] = y[3*i+0];
+ }
+}
+
+void __attribute__((noipa)) bar(TYPE * __restrict x,
+ TYPE *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[16*i+0] = y[5*i+0];
+ x[16*i+1] = y[5*i+1];
+ x[16*i+2] = y[5*i+2];
+ x[16*i+3] = y[5*i+3];
+ x[16*i+4] = y[5*i+4];
+ x[16*i+5] = y[5*i+0];
+ x[16*i+6] = y[5*i+1];
+ x[16*i+7] = y[5*i+2];
+ x[16*i+8] = y[5*i+3];
+ x[16*i+9] = y[5*i+4];
+ x[16*i+10] = y[5*i+0];
+ x[16*i+11] = y[5*i+1];
+ x[16*i+12] = y[5*i+2];
+ x[16*i+13] = y[5*i+3];
+ x[16*i+14] = y[5*i+4];
+ x[16*i+15] = y[5*i+0];
+ }
+}
+
+TYPE x[COUNT * 16];
+
+int
+main (void)
+{
+ void *y;
+ TYPE *end_y;
+
+ y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (y == MAP_FAILED)
+ {
+ perror ("mmap");
+ return 1;
+ }
+
+ end_y = (TYPE *) ((char *) y + MMAP_SIZE);
+
+ foo (x, end_y - COUNT * 3, COUNT);
+ bar (x, end_y - COUNT * 5, COUNT);
+
+ return 0;
+}
+
+/* We always require a scalar epilogue here but we don't know which
+ targets support vector composition this way. */
diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115385.c b/gcc/testsuite/gcc.target/i386/vect-pr115385.c
new file mode 100644
index 000000000000..a6be9ce4e543
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-pr115385.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse4.1 -mno-avx -fdump-tree-vect-details" } */
+
+void __attribute__((noipa)) foo(unsigned char * __restrict x,
+ unsigned char *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[16*i+0] = y[3*i+0];
+ x[16*i+1] = y[3*i+1];
+ x[16*i+2] = y[3*i+2];
+ x[16*i+3] = y[3*i+0];
+ x[16*i+4] = y[3*i+1];
+ x[16*i+5] = y[3*i+2];
+ x[16*i+6] = y[3*i+0];
+ x[16*i+7] = y[3*i+1];
+ x[16*i+8] = y[3*i+2];
+ x[16*i+9] = y[3*i+0];
+ x[16*i+10] = y[3*i+1];
+ x[16*i+11] = y[3*i+2];
+ x[16*i+12] = y[3*i+0];
+ x[16*i+13] = y[3*i+1];
+ x[16*i+14] = y[3*i+2];
+ x[16*i+15] = y[3*i+0];
+ }
+}
+
+void __attribute__((noipa)) bar(unsigned char * __restrict x,
+ unsigned char *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[16*i+0] = y[5*i+0];
+ x[16*i+1] = y[5*i+1];
+ x[16*i+2] = y[5*i+2];
+ x[16*i+3] = y[5*i+3];
+ x[16*i+4] = y[5*i+4];
+ x[16*i+5] = y[5*i+0];
+ x[16*i+6] = y[5*i+1];
+ x[16*i+7] = y[5*i+2];
+ x[16*i+8] = y[5*i+3];
+ x[16*i+9] = y[5*i+4];
+ x[16*i+10] = y[5*i+0];
+ x[16*i+11] = y[5*i+1];
+ x[16*i+12] = y[5*i+2];
+ x[16*i+13] = y[5*i+3];
+ x[16*i+14] = y[5*i+4];
+ x[16*i+15] = y[5*i+0];
+ }
+}
+
+/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect"} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect"} } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b151f53c2fbd..05fe523da993 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2151,11 +2151,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
nunits, &tem, &remain)
|| maybe_lt (remain + group_size, nunits)))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "peeling for gaps insufficient for "
- "access\n");
- return false;
+ /* But peeling a single scalar iteration is enough if
+ we can use the next power-of-two sized partial
+ access. */
+ unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
+ if (!nunits.is_constant (&cnunits)
+ || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
+ || ((cremain = remain.to_constant (), true)
+ && ((cpart_size = (1 << ceil_log2 (cremain))) != cnunits)
+ && vector_vector_composition_type
+ (vectype, cnunits / cpart_size,
+ &half_vtype) == NULL_TREE))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "peeling for gaps insufficient for "
+ "access\n");
+ return false;
+ }
}
/* If this is single-element interleaving with an element
@@ -11584,6 +11597,27 @@ vectorizable_load (vec_info *vinfo,
gcc_assert (new_vtype
|| LOOP_VINFO_PEELING_FOR_GAPS
(loop_vinfo));
+ /* But still reduce the access size to the next
+ required power-of-two so peeling a single
+ scalar iteration is sufficient. */
+ unsigned HOST_WIDE_INT cremain;
+ if (remain.is_constant (&cremain))
+ {
+ unsigned HOST_WIDE_INT cpart_size
+ = 1 << ceil_log2 (cremain);
+ if (known_gt (nunits, cpart_size)
+ && constant_multiple_p (nunits, cpart_size,
+ &num))
+ {
+ tree ptype;
+ new_vtype
+ = vector_vector_composition_type (vectype,
+ num,
+ &ptype);
+ if (new_vtype)
+ ltype = ptype;
+ }
+ }
}
}
tree offset
More information about the Gcc-cvs
mailing list