Bug 82426 - Missed tree-slp-vectorization on -O2 and -O3
Summary: Missed tree-slp-vectorization on -O2 and -O3
Status: ASSIGNED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 7.2.0
Importance: P3 enhancement
Target Milestone: ---
Assignee: Richard Biener
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
 
Reported: 2017-10-04 10:24 UTC by Allan Jensen
Modified: 2021-09-20 11:11 UTC
CC List: 1 user

See Also:
Host:
Target: x86_64-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2021-08-25 00:00:00


Attachments
vectslp.cpp (177 bytes, text/plain)
2017-10-04 10:24 UTC, Allan Jensen
Assembler output with -O3 (421 bytes, text/plain)
2017-10-04 10:25 UTC, Allan Jensen
Assembler output with -Os -ftree-slp-vectorize (381 bytes, text/plain)
2017-10-04 10:25 UTC, Allan Jensen

Description Allan Jensen 2017-10-04 10:24:33 UTC
Created attachment 42299 [details]
vectslp.cpp

The attached example is a simple matrix multiplication. With -O3 or -O2 -ftree-slp-vectorize the basic block is not vectorized.

Oddly, with -Os -ftree-slp-vectorize it is.
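
The testcase is essentially a concatenation of two 2D affine transforms. A rough sketch of its shape (the actual 177-byte attachment is not reproduced here; only m11 and dx are visible in the dumps below, so the struct name, the remaining field names and the exact layout are guesses):

  // Guessed reconstruction, not attachment 42299: a 2x2 matrix product
  // plus a translation, i.e. concatenation of two 2D affine transforms.
  struct Transform {
      float m11, m12, m21, m22, dx, dy;
  };

  Transform multiply(const Transform &a, const Transform &b)
  {
      Transform r;
      r.m11 = a.m11 * b.m11 + a.m12 * b.m21;
      r.m12 = a.m11 * b.m12 + a.m12 * b.m22;
      r.m21 = a.m21 * b.m11 + a.m22 * b.m21;
      r.m22 = a.m21 * b.m12 + a.m22 * b.m22;
      r.dx  = a.dx  * b.m11 + a.dy  * b.m21 + b.dx;
      r.dy  = a.dx  * b.m12 + a.dy  * b.m22 + b.dy;
      return r;
  }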
Comment 1 Allan Jensen 2017-10-04 10:25:13 UTC
Created attachment 42300 [details]
Assembler output with -O3
Comment 2 Allan Jensen 2017-10-04 10:25:36 UTC
Created attachment 42301 [details]
Assembler output with -Os -ftree-slp-vectorize
Comment 3 Allan Jensen 2017-10-04 10:33:07 UTC
Note that the ability to vectorize this at all at -Os appears to be new in GCC 7.
Comment 4 Andrew Pinski 2021-08-25 00:13:07 UTC
Hmm, on aarch64 we do a decent job at vectorizing this (since GCC 11):
        ldp     d4, d0, [x1]
        ldr     d7, [x0, 16]
        ldp     d6, d5, [x0]
        fmul    v3.2s, v0.2s, v7.s[1]
        ldr     d1, [x1, 16]
        fmul    v2.2s, v0.2s, v6.s[1]
        fmul    v0.2s, v0.2s, v5.s[1]
        fmla    v3.2s, v4.2s, v7.s[0]
        fmla    v2.2s, v4.2s, v6.s[0]
        fmla    v0.2s, v4.2s, v5.s[0]
        fadd    v1.2s, v1.2s, v3.2s
        stp     d2, d0, [x8]
        str     d1, [x8, 16]

I suspect this is because V2SF does not exist on x86_64.
Using -Dfloat=double seems to give better results on x86_64 (with -mavx2):
        vmovupd (%rdx), %ymm0
        vpermilpd       $0, (%rsi), %ymm1
        movq    %rdi, %rax
        vmovsd  32(%rsi), %xmm5
        vmovsd  40(%rsi), %xmm4
        vpermpd $68, %ymm0, %ymm2
        vpermpd $238, %ymm0, %ymm3
        vmulpd  %ymm2, %ymm1, %ymm2
        vpermilpd       $15, (%rsi), %ymm1
        vmulpd  %ymm3, %ymm1, %ymm1
        vaddpd  %ymm1, %ymm2, %ymm1
        vmulsd  %xmm5, %xmm0, %xmm2
        vmovupd %ymm1, (%rdi)
        vmovapd %xmm0, %xmm1
        vextractf128    $0x1, %ymm0, %xmm0
        vmulsd  %xmm4, %xmm0, %xmm3
        vunpckhpd       %xmm1, %xmm1, %xmm1
        vunpckhpd       %xmm0, %xmm0, %xmm0
        vmulsd  %xmm5, %xmm1, %xmm1
        vmulsd  %xmm4, %xmm0, %xmm0
        vaddsd  %xmm3, %xmm2, %xmm2
        vaddsd  32(%rdx), %xmm2, %xmm2
        vaddsd  %xmm0, %xmm1, %xmm1
        vaddsd  40(%rdx), %xmm1, %xmm1
        vmovsd  %xmm2, 32(%rdi)
        vmovsd  %xmm1, 40(%rdi)
Comment 5 Richard Biener 2021-08-25 07:13:07 UTC
x86 actually does have V2SF; the issue is that an opportunity for V4SF vectorization and one for V2SF arrive at the same load groups, and that causes a conflict (there are other PRs about this general issue), so we kill one part:

t.C:18:12: missed:   desired vector type conflicts with earlier one for _2 = b_35(D)->m11;
t.C:18:12: note:  removing SLP instance operations starting from: <retval>.dx = _27;

We also have a bunch of live lanes off the remaining vectorized piece, which makes the code a bit awkward.

Unfortunately we have no way to force 64-bit vectors (V2SF) here to see whether splitting up the V4SFmode partition would help (I guess it would, as can be seen from using 'double').
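
To make the "force 64-bit vectors" idea a bit more concrete at the source level, here is an illustrative sketch only (GNU vector extensions, not vectorizer output and not part of any patch, and it assumes the { m11, m12, m21, m22, dx, dy } layout guessed in the description) of the trailing dx/dy computation done in V2SF shape:

  // Sketch only: hand-written two-lane (V2SF-style) version of the dx/dy
  // part using GNU vector extensions; field layout as assumed earlier.
  typedef float v2sf __attribute__((vector_size(8)));

  static inline v2sf load2(const float *p)
  {
      v2sf v = { p[0], p[1] };
      return v;
  }

  // a and b point at the assumed { m11, m12, m21, m22, dx, dy } layout.
  static void translate_part(const float *a, const float *b, float *out)
  {
      v2sf row1 = load2(b);        // { b.m11, b.m12 }
      v2sf row2 = load2(b + 2);    // { b.m21, b.m22 }
      v2sf bd   = load2(b + 4);    // { b.dx,  b.dy  }
      v2sf adx  = { a[4], a[4] };  // broadcast a.dx
      v2sf ady  = { a[5], a[5] };  // broadcast a.dy
      v2sf r = row1 * adx + row2 * ady + bd;
      out[4] = r[0];               // r.dx
      out[5] = r[1];               // r.dy
  }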
Comment 6 Richard Biener 2021-09-20 11:11:47 UTC
I have a patch that produces

  vect__1.5_42 = MEM <const vector(4) float> [(float *)a_34(D)];
  vect__1.7_47 = VEC_PERM_EXPR <vect__1.5_42, vect__1.5_42, { 0, 0, 2, 2 }>;
  vect__2.10_49 = MEM <const vector(4) float> [(float *)b_35(D)];
  vect__2.12_53 = VEC_PERM_EXPR <vect__2.10_49, vect__2.10_49, { 0, 1, 0, 1 }>;
  vect__3.13_54 = vect__1.7_47 * vect__2.12_53;
  vect__2.30_73 = MEM <const vector(2) float> [(float *)b_35(D)];
  vect__1.18_61 = VEC_PERM_EXPR <vect__1.5_42, vect__1.5_42, { 1, 1, 3, 3 }>;
  vect__2.23_68 = VEC_PERM_EXPR <vect__2.10_49, vect__2.10_49, { 2, 3, 2, 3 }>;
  vect__6.24_69 = vect__1.18_61 * vect__2.23_68;
  vect__7.25_70 = vect__3.13_54 + vect__6.24_69;
  vect__5.40_85 = MEM <const vector(2) float> [(float *)b_35(D) + 8B];
  MEM <vector(4) float> [(float *)&<retval>] = vect__7.25_70;
  vect__21.35_81 = MEM <const vector(2) float> [(float *)a_34(D) + 16B];
  vect__1.36_82 = VEC_PERM_EXPR <vect__21.35_81, vect__21.35_81, { 0, 0 }>;
  vect__22.37_83 = vect__2.30_73 * vect__1.36_82;
  vect__1.46_94 = VEC_PERM_EXPR <vect__21.35_81, vect__21.35_81, { 1, 1 }>;
  vect__24.47_95 = vect__5.40_85 * vect__1.46_94;
  vect__25.48_96 = vect__22.37_83 + vect__24.47_95;
  vect__26.51_98 = MEM <const vector(2) float> [(float *)b_35(D) + 16B];
  vect__27.52_100 = vect__25.48_96 + vect__26.51_98;
  MEM <vector(2) float> [(float *)&<retval> + 16B] = vect__27.52_100;

That means it ends up with some odd vector loads, but with SSE 4.2 it becomes:

        movups  (%rsi), %xmm5
        movups  (%rdx), %xmm1
        movq    %rdi, %rax
        movq    (%rdx), %xmm4
        movq    8(%rdx), %xmm3
        movsldup        %xmm5, %xmm0
        movaps  %xmm1, %xmm2
        movlhps %xmm1, %xmm2
        shufps  $238, %xmm1, %xmm1
        mulps   %xmm0, %xmm2
        movshdup        %xmm5, %xmm0
        mulps   %xmm1, %xmm0
        movq    16(%rsi), %xmm1
        addps   %xmm2, %xmm0
        movups  %xmm0, (%rdi)
        movsldup        %xmm1, %xmm0
        movshdup        %xmm1, %xmm1
        mulps   %xmm4, %xmm0
        mulps   %xmm3, %xmm1
        addps   %xmm1, %xmm0
        movq    16(%rdx), %xmm1
        addps   %xmm1, %xmm0
        movlps  %xmm0, 16(%rdi)

Alternatively, -mavx can do some of the required perms as part of the loads, and with -mfma we can use an FMA as well:

        vpermilps       $238, (%rdx), %xmm1
        vpermilps       $245, (%rsi), %xmm0
        movq    %rdi, %rax
        vpermilps       $160, (%rsi), %xmm3
        vpermilps       $68, (%rdx), %xmm4
        vmulps  %xmm1, %xmm0, %xmm0
        vmovq   (%rdx), %xmm2
        vfmadd231ps     %xmm4, %xmm3, %xmm0
        vmovq   8(%rdx), %xmm3
        vmovups %xmm0, (%rdi)
        vmovq   16(%rsi), %xmm0
        vmovsldup       %xmm0, %xmm1
        vmovshdup       %xmm0, %xmm0
        vmulps  %xmm3, %xmm0, %xmm0
        vfmadd132ps     %xmm1, %xmm0, %xmm2
        vmovq   16(%rdx), %xmm0
        vaddps  %xmm2, %xmm0, %xmm0
        vmovlps %xmm0, 16(%rdi)

I'm not sure whether the vmovups + vmovs{l,h}dup sequence is any better than doing two scalar loads + dups though - at least it might avoid some STLF (store-to-load forwarding) conflict with earlier, smaller stores.
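
For illustration, the two alternatives weighed above could be written with SSE2/SSE3 intrinsics roughly as follows (a sketch, not taken from the patch; only the low two lanes matter for the dx/dy part, and which variant interacts better with store-to-load forwarding from earlier narrow stores is exactly the open question):

  #include <immintrin.h>

  // Variant A: one 8-byte vector load (movq) followed by in-register dups
  // (movsldup / movshdup), as in the code generated above.
  static inline void dups_from_vector_load(const float *p, __m128 *even, __m128 *odd)
  {
      __m128 v = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *)p)); // { p0, p1, 0, 0 }
      *even = _mm_moveldup_ps(v);   // { p0, p0, 0, 0 }
      *odd  = _mm_movehdup_ps(v);   // { p1, p1, 0, 0 }
  }

  // Variant B: two scalar loads, each broadcast separately.
  static inline void dups_from_scalar_loads(const float *p, __m128 *even, __m128 *odd)
  {
      *even = _mm_set1_ps(p[0]);    // { p0, p0, p0, p0 }
      *odd  = _mm_set1_ps(p[1]);    // { p1, p1, p1, p1 }
  }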