Bug 96275 - Vectorizer doesn't take into account bitmask condition from branch conditions.
Summary: Vectorizer doesn't take into account bitmask condition from branch conditions.
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 11.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
 
Reported: 2020-07-22 02:57 UTC by Witold Baryluk
Modified: 2020-12-28 16:36 UTC (History)
3 users

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2020-07-22 00:00:00


Description Witold Baryluk 2020-07-22 02:57:13 UTC
https://godbolt.org/z/Gfebjd

With gcc trunk 20200720

If the loop to be vectorized is inside an if condition that constrains the loop bound, or there is a preceding assert / early function return on such a condition, gcc seems to forget about it: the constraint is not taken into account by the optimizer / vectorizer, and the scalar fallback code for handling the stragglers is still emitted even though it is dead code.

#include "assert.h"

void fillArray(const unsigned int N, float * restrict a, const float* restrict b, const float* restrict c) {
    //assert(N >= 1024);
    for (int i = 0; i < (N & ~31u); i++) {
        a[i] = b[0] * c[i];
    }
}


produces:

fillArray:
        and     edi, -32
        je      .L8
        shr     edi, 3
        vbroadcastss    ymm1, DWORD PTR [rdx]
        xor     eax, eax
        mov     edx, edi
        sal     rdx, 5
.L3:
        vmulps  ymm0, ymm1, YMMWORD PTR [rcx+rax]
        vmovups YMMWORD PTR [rsi+rax], ymm0
        add     rax, 32
        cmp     rax, rdx
        jne     .L3
        vzeroupper
.L8:
        ret




but:

#include "assert.h"

void fillArray(const unsigned int N, float * restrict a, const float* restrict b, const float* restrict c) {
    //assert(N >= 1024);
    if ((N & 31u) == 0) {
        for (int i = 0; i < N; i++) {
            a[i] = b[0] * c[i];
        }
    }
}

produces this sub-optimal code:

fillArray:
        mov     eax, edi
        and     eax, 31
        jne     .L14
        test    edi, edi
        je      .L14
        lea     r8d, [rdi-1]
        vmovss  xmm1, DWORD PTR [rdx]
        cmp     r8d, 6
        jbe     .L8
        mov     edx, edi
        vbroadcastss    ymm2, xmm1
        xor     eax, eax
        shr     edx, 3
        sal     rdx, 5
.L4:
        vmulps  ymm0, ymm2, YMMWORD PTR [rcx+rax]
        vmovups YMMWORD PTR [rsi+rax], ymm0
        add     rax, 32
        cmp     rdx, rax
        jne     .L4
        mov     eax, edi
        and     eax, -8
        mov     edx, eax
        cmp     edi, eax
        je      .L16
        vzeroupper
.L3:
        mov     r9d, edi
        sub     r8d, eax
        sub     r9d, eax
        cmp     r8d, 2
        jbe     .L6
        mov     eax, eax
        vshufps xmm0, xmm1, xmm1, 0
        vmulps  xmm0, xmm0, XMMWORD PTR [rcx+rax*4]
        vmovups XMMWORD PTR [rsi+rax*4], xmm0
        mov     eax, r9d
        and     eax, -4
        add     edx, eax
        cmp     r9d, eax
        je      .L14
.L6:
        movsx   rax, edx
        vmulss  xmm0, xmm1, DWORD PTR [rcx+rax*4]
        vmovss  DWORD PTR [rsi+rax*4], xmm0
        lea     eax, [rdx+1]
        cmp     edi, eax
        jbe     .L14
        cdqe
        add     edx, 2
        vmulss  xmm0, xmm1, DWORD PTR [rcx+rax*4]
        vmovss  DWORD PTR [rsi+rax*4], xmm0
        cmp     edi, edx
        jbe     .L14
        movsx   rdx, edx
        vmulss  xmm1, xmm1, DWORD PTR [rcx+rdx*4]
        vmovss  DWORD PTR [rsi+rdx*4], xmm1
.L14:
        ret
.L16:
        vzeroupper
        ret
.L8:
        xor     edx, edx
        jmp     .L3


Adding `assert(N == (N & ~31u));` doesn't help.
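
For reference, this is roughly what the assert variant looks like (just a sketch of what I tried, with NDEBUG not defined):

#include <assert.h>

void fillArray(const unsigned int N, float * restrict a, const float* restrict b, const float* restrict c) {
    assert(N == (N & ~31u));  /* tells the compiler N is a multiple of 32 */
    for (int i = 0; i < N; i++) {
        a[i] = b[0] * c[i];
    }
}

The scalar epilogue is still generated for this version.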
Comment 1 Witold Baryluk 2020-07-22 03:14:59 UTC
FYI.

clang trunk 12 (76a0c0ee6ffa9c38485776921948d8f930109674) doesn't take the branch condition into account either:

fillArray:                              # @fillArray
        test    dil, 31
        jne     .LBB0_8
        test    edi, edi
        je      .LBB0_8
        vmovss  xmm0, dword ptr [rdx]           # xmm0 = mem[0],zero,zero,zero
        mov     eax, edi
        cmp     edi, 32
        jae     .LBB0_4
        xor     edx, edx
        jmp     .LBB0_7
.LBB0_4:
        vbroadcastss    ymm1, xmm0
        mov     edx, eax
        xor     edi, edi
        and     edx, -32
.LBB0_5:                                # =>This Inner Loop Header: Depth=1
        vmulps  ymm2, ymm1, ymmword ptr [rcx + 4*rdi]
        vmulps  ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
        vmulps  ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
        vmulps  ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 96]
        vmovups ymmword ptr [rsi + 4*rdi], ymm2
        vmovups ymmword ptr [rsi + 4*rdi + 32], ymm3
        vmovups ymmword ptr [rsi + 4*rdi + 64], ymm4
        vmovups ymmword ptr [rsi + 4*rdi + 96], ymm5
        add     rdi, 32
        cmp     rdx, rdi
        jne     .LBB0_5
        cmp     rdx, rax
        je      .LBB0_8
.LBB0_7:                                # =>This Inner Loop Header: Depth=1
        vmulss  xmm1, xmm0, dword ptr [rcx + 4*rdx]
        vmovss  dword ptr [rsi + 4*rdx], xmm1
        inc     rdx
        cmp     rax, rdx
        jne     .LBB0_7
.LBB0_8:
        vzeroupper
        ret


The main inner loop is unrolled / pipelined more aggressively, and the fallback code is simpler (it just handles the remaining elements as scalars), but that is unrelated. The point is that the fallback code is still there.



Changing to different variations of the condition, like `if ((N/32)*32 == N) {`, `if ((N % 32) == 0) {`, `if ((N & ~31u) == N) {`, or `if ((N >> 5) << 5 == N) {`, doesn't make any difference.

I tried with both signed int and unsigned int; same effect.

Reassigning to N (after removing constness), i.e. `N = N & ~31u` or `N = (N >> 5) << 5`, does appear to help, but if it only happens inside the conditional it is already too late.
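
For completeness, a sketch of the reassignment workaround mentioned above (const dropped from N so it can be overwritten):

void fillArray(unsigned int N, float * restrict a, const float* restrict b, const float* restrict c) {
    N = N & ~31u;  /* fold the multiple-of-32 guarantee into the loop bound itself */
    for (int i = 0; i < N; i++) {
        a[i] = b[0] * c[i];
    }
}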
Comment 2 Richard Biener 2020-07-22 08:05:44 UTC
niter analysis does not provide something like nonzero bits, so the vectorizer tries
to derive this on its own, but only from the niter expression, not by looking
at dominating conditions.

IMHO it belongs in niter analysis (but there the difficulty is with
latch vs. header executions we track).
Comment 3 Witold Baryluk 2020-12-27 23:53:02 UTC
Thanks for looking into this. Just a quick update: this is still suboptimal with current gcc trunk 20201226, while clang produces superior code.