Bug 99412 - s352 benchmark of TSVC is vectorized by clang and not by gcc
Summary: s352 benchmark of TSVC is vectorized by clang and not by gcc
Status: ASSIGNED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 11.0
Importance: P3 normal
Target Milestone: ---
Assignee: Richard Biener
URL:
Keywords: missed-optimization
Depends on: 97832
Blocks: vectorizer
  Show dependency tree / graph
 
Reported: 2021-03-05 14:54 UTC by Jan Hubicka
Modified: 2021-03-08 08:32 UTC (History)
2 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2021-03-08 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Jan Hubicka 2021-03-05 14:54:32 UTC
// Element type used by the kernel below (single-precision float).
typedef float real_t;

// Base repetition count; main() multiplies it by 8 for its outer loop bound.
#define iterations 100000
// Number of elements in each of the arrays a and b.
#define LEN_1D 32000
// Not referenced anywhere in this reproducer.
#define LEN_2D 256

// Input arrays for the dot product. They are file-scope objects with no
// initializer and are never written by the code shown here.
real_t a[LEN_1D],b[LEN_1D];
// Reduced reproducer for the TSVC s352 kernel: repeatedly computes the dot
// product of the global arrays a and b, with the inner loop manually unrolled
// by five, and returns the last computed value (implicitly converted from
// real_t to int).
int main ()
{

//    loop rerolling
//    unrolled dot product

    real_t dot;
    // The outer loop repeats the identical computation 8*iterations times;
    // dot is reset on every pass, so only the final pass determines the result.
    for (int nl = 0; nl < 8*iterations; nl++) {
        dot = (real_t)0.;
        // Each step consumes five consecutive elements. LEN_1D (32000) is a
        // multiple of 5, so the largest index accessed, i + 4, is 31999.
        for (int i = 0; i < LEN_1D; i += 5) {
            dot = dot + a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2]
                * b[i + 2] + a[i + 3] * b[i + 3] + a[i + 4] * b[i + 4];
        }
    }

    return dot;
}


clang does:
main:                                   # @main
        .cfi_startproc
# %bb.0:
        xorl    %eax, %eax
        .p2align        4, 0x90
.LBB0_1:                                # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
        vxorps  %xmm0, %xmm0, %xmm0
        movq    $-5, %rcx
        .p2align        4, 0x90
.LBB0_2:                                #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovups b+20(,%rcx,4), %xmm1
        vmovss  b+36(,%rcx,4), %xmm2            # xmm2 = mem[0],zero,zero,zero
        vmulps  a+20(,%rcx,4), %xmm1, %xmm1
        vpermilpd       $1, %xmm1, %xmm3        # xmm3 = xmm1[1,0]
        vaddps  %xmm3, %xmm1, %xmm1
        vmovshdup       %xmm1, %xmm3            # xmm3 = xmm1[1,1,3,3]
        vaddss  %xmm3, %xmm1, %xmm1
        vfmadd231ss     a+36(,%rcx,4), %xmm2, %xmm1 # xmm1 = (xmm2 * mem) + xmm1
        addq    $5, %rcx
        vaddss  %xmm0, %xmm1, %xmm0
        cmpq    $31995, %rcx                    # imm = 0x7CFB
        jb      .LBB0_2
# %bb.3:                                #   in Loop: Header=BB0_1 Depth=1
        incl    %eax
        cmpl    $800000, %eax                   # imm = 0xC3500
        jne     .LBB0_1
# %bb.4:
        vcvttss2si      %xmm0, %eax
        retq
Comment 1 Richard Biener 2021-03-08 08:32:46 UTC
With -fno-tree-reassoc we detect the reduction chain and produce

.L3:
        vmovaps b(%rax), %ymm5
        vmovaps b+32(%rax), %ymm6
        addq    $160, %rax
        vfmadd231ps     a-160(%rax), %ymm5, %ymm1
        vmovaps b-96(%rax), %ymm7
        vfmadd231ps     a-128(%rax), %ymm6, %ymm0
        vmovaps b-64(%rax), %ymm5
        vmovaps b-32(%rax), %ymm6
        vfmadd231ps     a-96(%rax), %ymm7, %ymm2
        vfmadd231ps     a-64(%rax), %ymm5, %ymm3
        vfmadd231ps     a-32(%rax), %ymm6, %ymm4
        cmpq    $128000, %rax
        jne     .L3
        vaddps  %ymm1, %ymm0, %ymm0
        vaddps  %ymm2, %ymm0, %ymm0
        vaddps  %ymm3, %ymm0, %ymm0
        vaddps  %ymm4, %ymm0, %ymm0
        vextractf128    $0x1, %ymm0, %xmm1
        vaddps  %xmm0, %xmm1, %xmm1
        vmovhlps        %xmm1, %xmm1, %xmm0
        vaddps  %xmm1, %xmm0, %xmm0
        vshufps $85, %xmm0, %xmm0, %xmm1
        vaddps  %xmm0, %xmm1, %xmm0
        decl    %edx
        jne     .L2

we're not re-rolling and thus are forced to use a VF of 4 here.

Note that LLVM doesn't seem to vectorize the loop but instead vectorizes
the basic-block which isn't what TSVC looks for (but that would work for
non-fast-math).