[Bug target/88570] Missing or ineffective vectorization of scatter load

crazylht at gmail dot com gcc-bugzilla@gcc.gnu.org
Sat Jan 28 03:29:21 GMT 2023


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88570

--- Comment #6 from Hongtao.liu <crazylht at gmail dot com> ---
1. knot should be cheaper than a vector compare into a mask register.
2. for test2, we failed to eliminate
  vcmppd  k1, ymm1, ymm2, 1
which is exactly the same as
  vcmppd  k1, ymm2, ymm1, 14


Note: pass_combine failed to generate zero-masking because the zero vector is
still used by the condition n1[n] > 0 (0.0); if we change the condition to
compare against a non-zero constant, then zero-masking will be generated.


void test1(int*__restrict n1, int*__restrict n2,
    int*__restrict n3, int*__restrict n4)
{
    for (int n = 0; n < 8; ++n)
    {
        if (n1[n] > 1) --- change from 0 -> 1.
            n2[n] = n3[n];
        else
            n2[n] = n4[n];
    }
}
test1:
        vmovdqu ymm1, YMMWORD PTR [rdi]
        mov     eax, 1
        vpbroadcastd    ymm0, eax
        vpcmpd  k1, ymm1, ymm0, 6
        vpcmpd  k2, ymm1, ymm0, 2
        vmovdqu32       ymm2{k1}{z}, YMMWORD PTR [rdx]
        vmovdqu32       ymm0{k2}{z}, YMMWORD PTR [rcx]
        vmovdqa32       ymm0{k1}, ymm2
        vmovdqu YMMWORD PTR [rsi], ymm0
        vzeroupper
        ret


More information about the Gcc-bugs mailing list