Bug 106081 - missed vectorization
Summary: missed vectorization
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: middle-end
Version: 13.0
Importance: P3 enhancement
Target Milestone: 14.0
Assignee: Richard Biener
URL:
Keywords: missed-optimization
Depends on: 96208
Blocks: spec vectorizer
Reported: 2022-06-24 15:56 UTC by Jan Hubicka
Modified: 2024-10-10 23:14 UTC
CC List: 6 users

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2022-06-27 00:00:00


Description Jan Hubicka 2022-06-24 15:56:40 UTC
This testcase (derived from ImageMagick)

struct pixels
{
        short a,b,c,d;
} *pixels;
struct dpixels
{
        double a,b,c,d;
};

double
test(double *k)
{
        struct dpixels results={};
        for (int u=0; u<10000;u++,k--)
        {
                results.a += *k*pixels[u].a;
                results.b += *k*pixels[u].b;
                results.c += *k*pixels[u].c;
                results.d += *k*pixels[u].d;
        }
        return results.a+results.b*2+results.c*3+results.d*4;
}

gets vectorized by clang:
test:                                   # @test
        .cfi_startproc
# %bb.0:
        movq    pixels(%rip), %rax
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %ecx, %ecx
        .p2align        4, 0x90
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vpmovsxwd       (%rax), %xmm1
        vbroadcastsd    (%rdi,%rcx,8), %ymm2
        addq    $8, %rax
        decq    %rcx
        vcvtdq2pd       %xmm1, %ymm1
        vfmadd231pd     %ymm2, %ymm1, %ymm0     # ymm0 = (ymm1 * ymm2) + ymm0
        cmpq    $-10000, %rcx                   # imm = 0xD8F0
        jne     .LBB0_1
# %bb.2:
        vpermilpd       $1, %xmm0, %xmm1        # xmm1 = xmm0[1,0]
        vfmadd132sd     .LCPI0_0(%rip), %xmm0, %xmm1 # xmm1 = (xmm1 * mem) + xmm0
        vextractf128    $1, %ymm0, %xmm0
        vfmadd231sd     .LCPI0_1(%rip), %xmm0, %xmm1 # xmm1 = (xmm0 * mem) + xmm1
        vpermilpd       $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0]
        vfmadd132sd     .LCPI0_2(%rip), %xmm1, %xmm0 # xmm0 = (xmm0 * mem) + xmm1
        vzeroupper
        retq

but not by GCC.
The original ImageMagick loop (morphology.c, shown with profile annotations) is:
    0.94 :   423cb0: vmovdqu (%rsi,%rdi,8),%xmm5 // morphology.c:2984
         : 2983   if ( IsNaN(*k) ) continue;
    0.29 :   423cb5: vpermilpd $0x1,(%rcx),%xmm4
         : 2982   for (u=0; u < (ssize_t) kernel->width; u++, k--) {
    0.46 :   423cbb: add    $0x2,%rdi
    0.07 :   423cbf: add    $0xfffffffffffffff0,%rcx
         : 2984   result.red     += (*k)*k_pixels[u].red;
    0.03 :   423cc3: vpshufb %xmm12,%xmm5,%xmm6
    6.81 :   423cc8: vcvtdq2pd %xmm6,%xmm6
   13.05 :   423ccc: vfmadd231pd %xmm6,%xmm4,%xmm1
         : 2985   result.green   += (*k)*k_pixels[u].green;
   17.45 :   423cd1: vpshufb %xmm15,%xmm5,%xmm6 // morphology.c:2985
    0.33 :   423cd6: vcvtdq2pd %xmm6,%xmm6
    0.00 :   423cda: vfmadd231pd %xmm6,%xmm4,%xmm3
         : 2986   result.blue    += (*k)*k_pixels[u].blue;
   15.28 :   423cdf: vpshufb %xmm13,%xmm5,%xmm6 // morphology.c:2986
         : 2987   result.opacity += (*k)*k_pixels[u].opacity;
    0.00 :   423ce4: vpshufb %xmm8,%xmm5,%xmm5
         : 2986   result.blue    += (*k)*k_pixels[u].blue;
    0.00 :   423ce9: vcvtdq2pd %xmm6,%xmm6
         : 2987   result.opacity += (*k)*k_pixels[u].opacity;
    0.21 :   423ced: vcvtdq2pd %xmm5,%xmm5
         : 2986   result.blue    += (*k)*k_pixels[u].blue;
    0.97 :   423cf1: vfmadd231pd %xmm6,%xmm4,%xmm0
         : 2987   result.opacity += (*k)*k_pixels[u].opacity;
   19.16 :   423cf6: vfmadd231pd %xmm5,%xmm4,%xmm2 // morphology.c:2987
         : 2982   for (u=0; u < (ssize_t) kernel->width; u++, k--) {
   14.51 :   423cfb: cmp    %rdi,%rbp // morphology.c:2982
    0.00 :   423cfe: jne    423cb0 <MorphologyApply.6136+0x20c0>

Changing short to double makes it vectorized:
.L2:
        vmovupd (%rax), %ymm4
        vmovupd 64(%rax), %ymm2
        subq    $-128, %rax
        subq    $32, %rdx
        vunpcklpd       -96(%rax), %ymm4, %ymm1
        vunpckhpd       -96(%rax), %ymm4, %ymm0
        vmovupd -64(%rax), %ymm4
        vunpckhpd       -32(%rax), %ymm2, %ymm2
        vunpcklpd       -32(%rax), %ymm4, %ymm4
        vpermpd $27, 32(%rdx), %ymm3
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm0, %ymm0
        vpermpd $216, %ymm2, %ymm2
        vpermpd $216, %ymm4, %ymm4
        vunpcklpd       %ymm2, %ymm0, %ymm10
        vunpckhpd       %ymm2, %ymm0, %ymm0
        vunpckhpd       %ymm4, %ymm1, %ymm9
        vunpcklpd       %ymm4, %ymm1, %ymm1
        vpermpd $216, %ymm10, %ymm10
        vpermpd $216, %ymm0, %ymm0
        vfmadd231pd     %ymm3, %ymm10, %ymm6
        vfmadd231pd     %ymm3, %ymm0, %ymm8
        vpermpd $216, %ymm9, %ymm9
        vpermpd $216, %ymm1, %ymm1
        vfmadd231pd     %ymm3, %ymm1, %ymm5
        vfmadd231pd     %ymm3, %ymm9, %ymm7
        cmpq    %rax, %rcx
        jne     .L2

However, clang's code looks shorter:
LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vbroadcastsd    (%rdi,%rcx,8), %ymm1
        vfmadd231pd     (%rax), %ymm1, %ymm0    # ymm0 = (ymm1 * mem) + ymm0
        addq    $32, %rax
        decq    %rcx
        cmpq    $-10000, %rcx                   # imm = 0xD8F0
        jne     .LBB0_1

It seems we loop-vectorize while clang SLP-vectorizes.
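
For reference, the short-to-double variant referred to above is simply the
original testcase with the struct member type changed (a sketch):

struct pixels
{
        double a,b,c,d;   /* was: short a,b,c,d; */
} *pixels;
struct dpixels
{
        double a,b,c,d;
};

double
test (double *k)
{
        struct dpixels results = {};
        for (int u = 0; u < 10000; u++, k--)
        {
                results.a += *k * pixels[u].a;
                results.b += *k * pixels[u].b;
                results.c += *k * pixels[u].c;
                results.d += *k * pixels[u].d;
        }
        return results.a + results.b*2 + results.c*3 + results.d*4;
}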
Comment 1 Jan Hubicka 2022-06-24 16:14:49 UTC
This is an attempt to vectorize by hand, but it seems we do not generate vpmovsxwd for the vector short->double conversion:

struct pixels
{
        short a __attribute__ ((vector_size(4*2)));
} *pixels;
struct dpixels
{
        double a __attribute__ ((vector_size(8*4)));
};
typedef double v4df __attribute__ ((vector_size (32)));

struct dpixels
test(double *k)
{
        struct dpixels results={};
        for (int u=0; u<10000;u++,k--)
        {
                results.a += *k*__builtin_convertvector (pixels[u].a, v4df);
        }
        return results;
}

clang seems to do the right thing here.
Comment 2 Hongtao.liu 2022-06-26 08:08:50 UTC
Looks quite similar to PR103999
Comment 3 Richard Biener 2022-06-27 10:38:35 UTC
This might be a somewhat easier case than PR103999 since the accumulation is in 'double' here.  In fact it should already work, but we fail to SLP the reduction
because

t.c:14:24: missed:   Build SLP failed: not grouped load _1 = *k_50;

and non-SLP as well:

t.c:14:24: note:   ==> examining statement: _1 = *k_50;
t.c:14:24: missed:   multiple types with negative step.
t.c:14:24: missed:   not falling back to elementwise accesses
t.c:16:30: missed:   not vectorized: relevant stmt not supported: _1 = *k_50;
t.c:14:24: missed:  bad operation or unsupported loop bound.

Changing k-- to k++ lets us vectorize the code, but in an awkward way
(SLP still fails on us, so we get interleaving).

So the first thing to fix is SLP of non-grouped loads, for which I think we
also have an existing bug (PR96208).
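
A sketch of the forward-stride variant mentioned above; the only change
relative to the original testcase is k-- becoming k++:

struct pixels
{
        short a,b,c,d;
} *pixels;
struct dpixels
{
        double a,b,c,d;
};

double
test (double *k)
{
        struct dpixels results = {};
        for (int u = 0; u < 10000; u++, k++)   /* positive step on k */
        {
                results.a += *k * pixels[u].a;
                results.b += *k * pixels[u].b;
                results.c += *k * pixels[u].c;
                results.d += *k * pixels[u].d;
        }
        return results.a + results.b*2 + results.c*3 + results.d*4;
}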
Comment 4 Jan Hubicka 2022-06-27 14:26:48 UTC
Thanks! It seems that ImageMagick has quite a few loops that involve consuming shorts and producing doubles.  Also, it would be nice to do something about __builtin_convertvector so it does not produce stupid code...
Comment 5 Richard Biener 2023-06-21 13:45:28 UTC
PR96208 tracks SLP of non-grouped loads.  We can now convert short -> double, and with the grouped load hacked in and -march=znver3 we get:

.L2:
        vmovdqu (%rax), %ymm0
        vpermq  $27, -24(%rdi), %ymm10
        addq    $32, %rax
        subq    $32, %rdi
        vpshufb %ymm7, %ymm0, %ymm0
        vpermpd $85, %ymm10, %ymm9
        vpermpd $170, %ymm10, %ymm8
        vpermpd $255, %ymm10, %ymm6
        vpmovsxwd       %xmm0, %ymm1
        vextracti128    $0x1, %ymm0, %xmm0
        vbroadcastsd    %xmm10, %ymm10
        vcvtdq2pd       %xmm1, %ymm11
        vextracti128    $0x1, %ymm1, %xmm1
        vpmovsxwd       %xmm0, %ymm0
        vcvtdq2pd       %xmm1, %ymm1
        vfmadd231pd     %ymm10, %ymm11, %ymm5
        vfmadd231pd     %ymm9, %ymm1, %ymm2
        vcvtdq2pd       %xmm0, %ymm1
        vextracti128    $0x1, %ymm0, %xmm0
        vcvtdq2pd       %xmm0, %ymm0
        vfmadd231pd     %ymm8, %ymm1, %ymm4
        vfmadd231pd     %ymm6, %ymm0, %ymm3
        cmpq    %rax, %rdx
        jne     .L2

That is, the 'short' data type forces a higher VF on us, and the splat
codegen I hacked in is still sub-optimal.
Comment 6 Richard Biener 2023-06-27 08:13:28 UTC
What's interesting is that, as of r14-2117-gdd86a5a69cbda4, we now get the
following.  The odd thing is that we fail to eliminate the load permutation
{ 3 2 1 0 } even though this is a reduction group.

I _suppose_ the reason is the { 0 0 0 0 } load permutation (the "splat")
which we don't "support".  In vect_optimize_slp_pass::start_choosing_layouts
there's

      if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
        {
          /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
             unpermuted, record a layout that reverses this permutation.

             We would need more work to cope with loads that are internally
             permuted and also have inputs (such as masks for
             IFN_MASK_LOADs).  */
          gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
          if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
            continue;

which means we'll keep the permute there (well, that's OK - any permute
of the permute will retain it ...).  I suspect this prevents the optimization
here.  Massaging start_choosing_layouts to allow a splat on element zero
for a non-grouped access breaks things as we try to move that permute.
So I guess this needs a new kind of layout constraint?  The permute
can absorb any permute but we cannot "move" it.

Richard?


t.c:14:18: note:   === scheduling SLP instances ===
t.c:14:18: note:   Vectorizing SLP tree:
t.c:14:18: note:   node 0x4304170 (max_nunits=16, refcnt=2) vector(4) double
t.c:14:18: note:   op template: _21 = _20 + results$d_60;
t.c:14:18: note:        stmt 0 _21 = _20 + results$d_60;
t.c:14:18: note:        stmt 1 _17 = _16 + results$c_58;
t.c:14:18: note:        stmt 2 _13 = _12 + results$b_56;
t.c:14:18: note:        stmt 3 _9 = _8 + results$a_54;
t.c:14:18: note:        children 0x43041f8 0x4304418
t.c:14:18: note:   node 0x43041f8 (max_nunits=16, refcnt=1) vector(4) double
t.c:14:18: note:   op template: _20 = _1 * _19;
t.c:14:18: note:        stmt 0 _20 = _1 * _19;
t.c:14:18: note:        stmt 1 _16 = _1 * _15;
t.c:14:18: note:        stmt 2 _12 = _1 * _11;
t.c:14:18: note:        stmt 3 _8 = _1 * _7;
t.c:14:18: note:        children 0x4304280 0x4304308
t.c:14:18: note:   node 0x4304280 (max_nunits=4, refcnt=1) vector(4) double
t.c:14:18: note:   op template: _1 = *k_50;
t.c:14:18: note:        stmt 0 _1 = *k_50;
t.c:14:18: note:        stmt 1 _1 = *k_50;
t.c:14:18: note:        stmt 2 _1 = *k_50;
t.c:14:18: note:        stmt 3 _1 = *k_50;
t.c:14:18: note:        load permutation { 0 0 0 0 }
t.c:14:18: note:   node 0x4304308 (max_nunits=16, refcnt=1) vector(4) double
t.c:14:18: note:   op template: _19 = (double) _18;
t.c:14:18: note:        stmt 0 _19 = (double) _18;
t.c:14:18: note:        stmt 1 _15 = (double) _14;
t.c:14:18: note:        stmt 2 _11 = (double) _10;
t.c:14:18: note:        stmt 3 _7 = (double) _6;
t.c:14:18: note:        children 0x4304390
t.c:14:18: note:   node 0x4304390 (max_nunits=16, refcnt=1) vector(16) short int
t.c:14:18: note:   op template: _18 = _5->d;
t.c:14:18: note:        stmt 0 _18 = _5->d;
t.c:14:18: note:        stmt 1 _14 = _5->c;
t.c:14:18: note:        stmt 2 _10 = _5->b;
t.c:14:18: note:        stmt 3 _6 = _5->a;
t.c:14:18: note:        load permutation { 3 2 1 0 }
t.c:14:18: note:   node 0x4304418 (max_nunits=4, refcnt=1) vector(4) double
t.c:14:18: note:   op template: results$d_60 = PHI <_21(5), 0.0(6)>
t.c:14:18: note:        stmt 0 results$d_60 = PHI <_21(5), 0.0(6)>
t.c:14:18: note:        stmt 1 results$c_58 = PHI <_17(5), 0.0(6)>
t.c:14:18: note:        stmt 2 results$b_56 = PHI <_13(5), 0.0(6)>
t.c:14:18: note:        stmt 3 results$a_54 = PHI <_9(5), 0.0(6)>
t.c:14:18: note:        children 0x4304170 (nil)
Comment 7 Richard Sandiford 2023-06-27 09:10:22 UTC
I don't think the splat creates a new layout, but instead a
splat should be allowed to change its layout at zero cost.
Comment 8 Jan Hubicka 2023-06-28 12:15:11 UTC
ImageMagick improved by 17% on zen3 and 11% on altra:
https://lnt.opensuse.org/db_default/v4/SPEC/37550
https://lnt.opensuse.org/db_default/v4/SPEC/37543
which is cool :)

The loop is now optimized as:

.L2:
        vmovdqu16       (%rax), %zmm0
        vmovupd (%rdx), %zmm2
        addq    $64, %rax
        subq    $64, %rdx
        vpermpd %zmm2, %zmm15, %zmm9
        vpermpd %zmm2, %zmm14, %zmm8
        vpermpd %zmm2, %zmm13, %zmm7
        vpermpd %zmm2, %zmm11, %zmm2
        vpshufb %zmm12, %zmm0, %zmm0
        vpmovsxwd       %ymm0, %zmm1
        vextracti64x4   $0x1, %zmm0, %ymm0
        vpmovsxwd       %ymm0, %zmm0
        vcvtdq2pd       %ymm1, %zmm10
        vextracti32x8   $0x1, %zmm1, %ymm1
        vcvtdq2pd       %ymm1, %zmm1
        vfmadd231pd     %zmm2, %zmm10, %zmm6
        vfmadd231pd     %zmm9, %zmm1, %zmm3
        vcvtdq2pd       %ymm0, %zmm1
        vextracti32x8   $0x1, %zmm0, %ymm0
        vcvtdq2pd       %ymm0, %zmm0
        vfmadd231pd     %zmm8, %zmm1, %zmm5
        vfmadd231pd     %zmm7, %zmm0, %zmm4
        cmpq    %rax, %rcx
        jne     .L2
Comment 9 Richard Biener 2023-07-26 09:34:48 UTC
So I can adjust change_layout_cost in a bit awkward way, but it seems that
internal_node_cost would already work out that a permute can be merged into
an existing permute.

It seems that existing permutes are not recorded in the "layout".

Also vectorizable_slp_permutation_1 doesn't try to elide permutes that
are noop based on knowledge of the layout of 'node', say a permute
{ 1 0 3 2 } of a { _1, _1, _2, _2 } node would be noop.  But
change_layout_cost does MAX (count, 1) on its result anyway.
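
To make the no-op case concrete, a hypothetical source-level sketch (not from
the PR; the function name is made up and __builtin_shufflevector, available in
recent GCC and clang, merely spells out the lane permutation):

typedef double v4df __attribute__ ((vector_size (32)));

/* Permuting a node whose lanes are { _1, _1, _2, _2 } with { 1, 0, 3, 2 }
   reproduces the same lanes, so the permute is a no-op.  */
v4df
noop_permute (double x, double y)
{
  v4df v = { x, x, y, y };
  return __builtin_shufflevector (v, v, 1, 0, 3, 2);   /* still { x, x, y, y } */
}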

The following elides the unnecessary permutation for this special case
(but not the general case):

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index e4430248ab5..e9048a61891 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4389,6 +4389,19 @@ vect_optimize_slp_pass::change_layout_cost (slp_tree node,
   if (from_layout_i == to_layout_i)
     return 0;
 
+  /* When there's a uniform load permutation permutating that in any
+     way is free.  */
+  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+    {
+      unsigned l = SLP_TREE_LOAD_PERMUTATION (node)[0];
+      unsigned i;
+      for (i = 1; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
+       if (SLP_TREE_LOAD_PERMUTATION (node)[i] != l)
+         break;
+      if (i == SLP_TREE_LOAD_PERMUTATION (node).length ())
+       return 0;
+    }
+
   auto_vec<slp_tree, 1> children (1);
   children.quick_push (node);
   auto_lane_permutation_t perm (SLP_TREE_LANES (node));


I'm not sure this is the correct place to factor in cost savings
materialization would give.  Is it?  Are explicit VEC_PERM nodes
also still there in the optimization process or are they turned
into sth implicit?
Comment 10 Richard Sandiford 2023-07-26 10:01:10 UTC
(In reply to Richard Biener from comment #9)
> So I can adjust change_layout_cost in a bit awkward way, but it seems that
> internal_node_cost would already work out that a permute can be merged into
> an existing permute.
Right.

> It seems that existing permutes are not recorded in the "layout".
They should be if they're bijective, via:

      else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
               && SLP_TREE_CHILDREN (node).length () == 1
               && (child = SLP_TREE_CHILDREN (node)[0])
               && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
                   .is_constant (&imin)))
        {
          /* If the child has the same vector size as this node,
             reversing the permutation can make the permutation a no-op.
             In other cases it can change a true permutation into a
             full-vector extract.  */
          tmp_perm.reserve (SLP_TREE_LANES (node));
          for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
            tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
        }

> Also vectorizable_slp_permutation_1 doesn't try to elide permutes that
> are noop based on knowledge of the layout of 'node', say a permute
> { 1 0 3 2 } of a { _1, _1, _2, _2 } node would be noop.
To do that in general, I think we'd need to value-number each
element of each node (which sounds doable).  But I guess doing
it at leaves would be better than nothing.
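
A hypothetical sketch of that idea (not GCC code; the names are made up):
with a value number per lane, a permutation is a no-op exactly when every
result lane receives the same value it already holds.

#include <stdbool.h>

static bool
permutation_is_noop (const int *lane_value_number, const unsigned *perm,
                     unsigned nlanes)
{
  /* perm[i] is the source lane for result lane i; the permutation is a
     no-op iff each result lane gets a value equal to the one already
     in that lane.  */
  for (unsigned i = 0; i < nlanes; ++i)
    if (lane_value_number[perm[i]] != lane_value_number[i])
      return false;
  return true;
}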

> But change_layout_cost does MAX (count, 1) on its result anyway.
At the moment, yes, because having from_layout_i != to_layout_i
for identical layouts would be a consistency failure.

> The following elides the unnecessary permutation for this special case
> (but not the general case):
> 
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index e4430248ab5..e9048a61891 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -4389,6 +4389,19 @@ vect_optimize_slp_pass::change_layout_cost (slp_tree
> node,
>    if (from_layout_i == to_layout_i)
>      return 0;
>  
> +  /* When there's a uniform load permutation permutating that in any
> +     way is free.  */
> +  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
> +    {
> +      unsigned l = SLP_TREE_LOAD_PERMUTATION (node)[0];
> +      unsigned i;
> +      for (i = 1; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
> +       if (SLP_TREE_LOAD_PERMUTATION (node)[i] != l)
> +         break;
> +      if (i == SLP_TREE_LOAD_PERMUTATION (node).length ())
> +       return 0;
> +    }
> +
>    auto_vec<slp_tree, 1> children (1);
>    children.quick_push (node);
>    auto_lane_permutation_t perm (SLP_TREE_LANES (node));
> 
> I'm not sure this is the correct place to factor in cost savings
> materialization would give.  Is it?
Yeah, I think so.  The patch LGTM.  I don't know if it's worth
caching the “all the same element” result, but probably not.

> Are explicit VEC_PERM nodes also still there in the optimization
> process or are they turned into sth implicit?
They're still there.  The current algorithm inherits the old
restriction that candidate layouts must be bijective, and not
all VEC_PERM_EXPRs are.  So some VEC_PERM_EXPRs would have to
be explicit whatever happens.  Same for non-bijective load
permutations.
Comment 11 Richard Biener 2023-07-26 10:32:21 UTC
(In reply to rsandifo@gcc.gnu.org from comment #10)
> (In reply to Richard Biener from comment #9)
[...]
> > The following elides the unnecessary permutation for this special case
> > (but not the general case):
> > 
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index e4430248ab5..e9048a61891 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -4389,6 +4389,19 @@ vect_optimize_slp_pass::change_layout_cost (slp_tree
> > node,
> >    if (from_layout_i == to_layout_i)
> >      return 0;
> >  
> > +  /* When there's a uniform load permutation permutating that in any
> > +     way is free.  */
> > +  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
> > +    {
> > +      unsigned l = SLP_TREE_LOAD_PERMUTATION (node)[0];
> > +      unsigned i;
> > +      for (i = 1; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
> > +       if (SLP_TREE_LOAD_PERMUTATION (node)[i] != l)
> > +         break;
> > +      if (i == SLP_TREE_LOAD_PERMUTATION (node).length ())
> > +       return 0;
> > +    }
> > +
> >    auto_vec<slp_tree, 1> children (1);
> >    children.quick_push (node);
> >    auto_lane_permutation_t perm (SLP_TREE_LANES (node));
> > 
> > I'm not sure this is the correct place to factor in cost savings
> > materialization would give.  Is it?
> Yeah, I think so.  The patch LGTM.  I don't know if it's worth
> caching the “all the same element” result, but probably not.

Yeah, I thought of changing the load_permutation representation to
vec_perm_indices, but since I'm considering merging lane and load permutes
that would be the wrong step.

I've thought of generalizing the above, but change_layout_cost is always
positive; for example, when doing { 3, 2, 1, 0 } on top of an existing
load permutation of { 3, 2, 1, 0 } the cost should arguably be negative?
(See the sketch at the end of this comment.)

Anyway, going to test the above patch.
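
To make the { 3, 2, 1, 0 } on top of { 3, 2, 1, 0 } case concrete, a
hypothetical source-level sketch (not from the PR; the function name is made
up and __builtin_shufflevector merely spells out the layout change):

typedef double v4df __attribute__ ((vector_size (32)));

v4df
compose_reversals (const double *p)
{
  v4df reversed = { p[3], p[2], p[1], p[0] };          /* load permutation { 3 2 1 0 } */
  return __builtin_shufflevector (reversed, reversed,  /* layout change { 3 2 1 0 } */
                                  3, 2, 1, 0);         /* = { p[0], p[1], p[2], p[3] }, a plain load */
}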
Comment 12 Richard Biener 2023-07-26 10:52:07 UTC
Btw, I see we actually materialize a permute before the splat:

t.c:14:24: note:   node 0x5b311c0 (max_nunits=1, refcnt=2) vector(2) double
t.c:14:24: note:   op: VEC_PERM_EXPR
t.c:14:24: note:        stmt 0 _1 = *k_50;
t.c:14:24: note:        stmt 1 _1 = *k_50;
t.c:14:24: note:        stmt 2 _1 = *k_50;
t.c:14:24: note:        stmt 3 _1 = *k_50;
t.c:14:24: note:        lane permutation { 0[3] 0[2] 0[1] 0[0] }
t.c:14:24: note:        children 0x5b30fc0
t.c:14:24: note:   node 0x5b30fc0 (max_nunits=2, refcnt=1) vector(2) double
t.c:14:24: note:   op template: _1 = *k_50;
t.c:14:24: note:        stmt 0 _1 = *k_50;
t.c:14:24: note:        stmt 1 _1 = *k_50;
t.c:14:24: note:        stmt 2 _1 = *k_50;
t.c:14:24: note:        stmt 3 _1 = *k_50;
t.c:14:24: note:        load permutation { 0 0 0 0 }

this is because vect_optimize_slp_pass::get_result_with_layout doesn't
seem to consider load permutations?  It's the value-numbering we perform
that in the end elides the redundant permute.

I have the feeling something doesn't fit together exactly: during
materialize () we apply the chosen layout to the partitions, then
eliminate redundant permutes, but we still end up with
get_result_with_layout adding the above permute.

I can add the same logic as I've added to change_layout_cost also to
get_result_with_layout but as said, it feels like I'm missing something ...
Comment 13 Richard Sandiford 2023-07-26 11:00:38 UTC
(In reply to Richard Biener from comment #12)
> Btw, I see we actually materialize a permute before the splat:
> 
> t.c:14:24: note:   node 0x5b311c0 (max_nunits=1, refcnt=2) vector(2) double
> t.c:14:24: note:   op: VEC_PERM_EXPR
> t.c:14:24: note:        stmt 0 _1 = *k_50;
> t.c:14:24: note:        stmt 1 _1 = *k_50;
> t.c:14:24: note:        stmt 2 _1 = *k_50;
> t.c:14:24: note:        stmt 3 _1 = *k_50;
> t.c:14:24: note:        lane permutation { 0[3] 0[2] 0[1] 0[0] }
> t.c:14:24: note:        children 0x5b30fc0
> t.c:14:24: note:   node 0x5b30fc0 (max_nunits=2, refcnt=1) vector(2) double
> t.c:14:24: note:   op template: _1 = *k_50;
> t.c:14:24: note:        stmt 0 _1 = *k_50;
> t.c:14:24: note:        stmt 1 _1 = *k_50;
> t.c:14:24: note:        stmt 2 _1 = *k_50;
> t.c:14:24: note:        stmt 3 _1 = *k_50;
> t.c:14:24: note:        load permutation { 0 0 0 0 }
> 
> this is because vect_optimize_slp_pass::get_result_with_layout doesn't
> seem to consider load permutations?
Yeah.  That's because, if to_layout_i != from_layout_i, the caller
is asking for a different layout from the one that we picked for
the load.  If we wanted to change the load permutation in-place
for the given to_layout_i, we'd need to duplicate the load too,
which didn't seem like a good trade-off.
Comment 14 Richard Sandiford 2023-07-26 11:15:07 UTC
FWIW, changing:

          if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
            continue;

to:

          if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
            {
              partition.layout = -1;
              continue;
            }

in start_choosing_layouts fixes it for me.  We can then choose layout 1 for the splat.

I think these conditions were carried over from the old code.  Without checking further, I'm not 100% sure what effect relaxing them would have :)
Comment 15 rguenther@suse.de 2023-07-26 12:15:33 UTC
On Wed, 26 Jul 2023, rsandifo at gcc dot gnu.org wrote:

> FWIW, changing:
> 
>           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
>             continue;
> 
> to:
> 
>           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
>             {
>               partition.layout = -1;
>               continue;
>             }
> 
> in start_choosing_layouts fixes it for me.  We can then choose layout 1 for the
> splat.
> 
> I think these conditions were carried over from the old code.  Without checking
> further, I'm not 100% sure what effect relaxing them would have :)

I added the above condition recently when introducing support for splats.
The above change indeed works - I'm going to give it further testing.
Comment 16 GCC Commits 2023-07-26 13:28:11 UTC
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:5d09fb683a8abce49dc0992f5102aa0189f8f632

commit r14-2790-g5d09fb683a8abce49dc0992f5102aa0189f8f632
Author: Richard Biener <rguenther@suse.de>
Date:   Wed Jul 26 13:31:16 2023 +0200

    tree-optimization/106081 - elide redundant permute
    
    The following patch makes sure to elide a redundant permute that
    can be merged with existing splats represented as load permutations
    as we now do for non-grouped SLP loads.  This is the last bit
    missing to fix this PR where the main fix was already done by
    r14-2117-gdd86a5a69cbda4
    
            PR tree-optimization/106081
            * tree-vect-slp.cc (vect_optimize_slp_pass::start_choosing_layouts):
            Assign layout -1 to splats.
    
            * gcc.dg/vect/pr106081.c: New testcase.
Comment 17 Richard Biener 2023-07-26 13:28:34 UTC
Fixed now.