This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: Make the vectoriser drop to strided accesses for stores with gaps
- From: Richard Biener <richard dot guenther at gmail dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>, richard dot sandiford at arm dot com
- Date: Fri, 20 Jul 2018 13:08:55 +0200
- Subject: Re: Make the vectoriser drop to strided accesses for stores with gaps
- References: <87effy2n0g.fsf@arm.com>
On Fri, Jul 20, 2018 at 12:57 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> We could vectorise:
>
> for (...)
> {
> a[0] = ...;
> a[1] = ...;
> a[2] = ...;
> a[3] = ...;
> a += stride;
> }
>
> (including the case when stride == 8) but not:
>
> for (...)
> {
> a[0] = ...;
> a[1] = ...;
> a[2] = ...;
> a[3] = ...;
> a += 8;
> }
>
> (where the stride is always 8). The former was treated as a "grouped
> and strided" store, while the latter was treated as grouped store with
> gaps, which we don't support.
>
> This patch makes us treat groups of stores with gaps at the end as
> strided groups too. I tried to go through all uses of STMT_VINFO_STRIDED_P
> and all vector uses of DR_STEP to see whether there were any hard-baked
> assumptions, but couldn't see any. I wondered whether we should relax:
>
> /* We do not have to consider dependences between accesses that belong
> to the same group, unless the stride could be smaller than the
> group size. */
> if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
> && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
> == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
> && !STMT_VINFO_STRIDED_P (stmtinfo_a))
> return false;
>
> for cases in which the step is constant and the absolute step is known
> to be greater than the group size, but data dependence analysis should
> already return chrec_known for those cases.
>
> The new test is a version of vect-avg-15.c with the variable step
> replaced by a constant one.
>
> A natural follow-on would be to do the same for groups with gaps in
> the middle:
>
> /* Check that the distance between two accesses is equal to the type
> size. Otherwise, we have gaps. */
> diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
> - TREE_INT_CST_LOW (prev_init)) / type_size;
> if (diff != 1)
> {
> [...]
> if (DR_IS_WRITE (data_ref))
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "interleaved store with gaps\n");
> return false;
> }
>
> But I think we should do that separately and see what the fallout
> from this change is first.
Agreed.
> Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
> and x86_64-linux-gnu. OK to install?
Do you need to set STMT_VINFO_STRIDED_P on all stmts? I think
it is enough to set it on the first group element.
OK otherwise.
Thanks,
Richard.
> Richard
>
>
> 2018-07-20 Richard Sandiford <richard.sandiford@arm.com>
>
> gcc/
> * tree-vect-data-refs.c (vect_analyze_group_access_1): Convert
> grouped stores with gaps to a strided group.
>
> gcc/testsuite/
> * gcc.dg/vect/vect-avg-16.c: New test.
> * gcc.dg/vect/slp-37.c: Expect the loop to be vectorized.
> * gcc.dg/vect/vect-strided-u8-i8-gap4.c,
> * gcc.dg/vect/vect-strided-u8-i8-gap4-big-array.c: Likewise for
> the second loop in main1.
>
> Index: gcc/tree-vect-data-refs.c
> ===================================================================
> --- gcc/tree-vect-data-refs.c 2018-06-30 13:44:38.567611988 +0100
> +++ gcc/tree-vect-data-refs.c 2018-07-20 11:55:22.570911497 +0100
> @@ -2632,10 +2632,14 @@ vect_analyze_group_access_1 (struct data
> if (groupsize != count
> && !DR_IS_READ (dr))
> {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "interleaved store with gaps\n");
> - return false;
> + groupsize = count;
> + STMT_VINFO_STRIDED_P (stmt_info) = true;
> + stmt_vec_info next_info = stmt_info;
> + while (DR_GROUP_NEXT_ELEMENT (next_info))
> + {
> + next_info = vinfo_for_stmt (DR_GROUP_NEXT_ELEMENT (next_info));
> + STMT_VINFO_STRIDED_P (next_info) = 1;
> + }
> }
>
> /* If there is a gap after the last load in the group it is the
> @@ -2651,6 +2655,8 @@ vect_analyze_group_access_1 (struct data
> "Detected interleaving ");
> if (DR_IS_READ (dr))
> dump_printf (MSG_NOTE, "load ");
> + else if (STMT_VINFO_STRIDED_P (stmt_info))
> + dump_printf (MSG_NOTE, "strided store ");
> else
> dump_printf (MSG_NOTE, "store ");
> dump_printf (MSG_NOTE, "of size %u starting with ",
> Index: gcc/testsuite/gcc.dg/vect/vect-avg-16.c
> ===================================================================
> --- /dev/null 2018-07-09 14:52:09.234750850 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-avg-16.c 2018-07-20 11:55:22.570911497 +0100
> @@ -0,0 +1,52 @@
> +/* { dg-additional-options "-O3" } */
> +/* { dg-require-effective-target vect_int } */
> +
> +#include "tree-vect.h"
> +
> +#define N 80
> +
> +void __attribute__ ((noipa))
> +f (signed char *restrict a, signed char *restrict b,
> + signed char *restrict c, int n)
> +{
> + for (int j = 0; j < n; ++j)
> + {
> + for (int i = 0; i < 16; ++i)
> + a[i] = (b[i] + c[i]) >> 1;
> + a += 20;
> + b += 20;
> + c += 20;
> + }
> +}
> +
> +#define BASE1 -126
> +#define BASE2 -42
> +
> +signed char a[N], b[N], c[N];
> +
> +int
> +main (void)
> +{
> + check_vect ();
> +
> + for (int i = 0; i < N; ++i)
> + {
> + a[i] = i;
> + b[i] = BASE1 + i * 3;
> + c[i] = BASE2 + i * 2;
> + asm volatile ("" ::: "memory");
> + }
> + f (a, b, c, N / 20);
> + for (int i = 0; i < N; ++i)
> + {
> + int d = (BASE1 + BASE2 + i * 5) >> 1;
> + if (a[i] != (i % 20 < 16 ? d : i))
> + __builtin_abort ();
> + }
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
> +/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } } */
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" { target vect_avg_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */
> Index: gcc/testsuite/gcc.dg/vect/slp-37.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/vect/slp-37.c 2018-05-02 08:37:49.033604262 +0100
> +++ gcc/testsuite/gcc.dg/vect/slp-37.c 2018-07-20 11:55:22.570911497 +0100
> @@ -17,8 +17,8 @@ foo1 (s1 *arr)
> int i;
> s1 *ptr = arr;
>
> - /* Different constant types - not SLPable. The group size is not power of 2,
> - interleaving is not supported either. */
> + /* Vectorized as a strided SLP pair of accesses to <a, b> and a single
> + strided access to c. */
> for (i = 0; i < N; i++)
> {
> ptr->a = 6;
> @@ -58,6 +58,5 @@ int main (void)
> return 0;
> }
>
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
> -
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
> Index: gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4.c 2018-05-02 08:37:49.021604375 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4.c 2018-07-20 11:55:22.570911497 +0100
> @@ -53,7 +53,7 @@ main1 (s *arr)
> }
>
> ptr = arr;
> - /* Not vectorizable: gap in store. */
> + /* Vectorized as a strided SLP pair. */
> for (i = 0; i < N; i++)
> {
> res[i].a = ptr->b;
> @@ -97,5 +97,4 @@ int main (void)
> return 0;
> }
>
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_strided8 } } } */
> -
> +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_strided8 } } } */
> Index: gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-big-array.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-big-array.c 2018-05-02 08:37:49.013604450 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-big-array.c 2018-07-20 11:55:22.570911497 +0100
> @@ -55,7 +55,7 @@ main1 (s *arr)
> }
>
> ptr = arr;
> - /* Not vectorizable: gap in store. */
> + /* Vectorized as a strided SLP pair. */
> for (i = 0; i < N; i++)
> {
> res[i].a = ptr->b;
> @@ -110,5 +110,4 @@ int main (void)
> return 0;
> }
>
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_strided8 } } } */
> -
> +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_strided8 } } } */