Bug 117776 - [12/13/14/15 Regression] Missed optimization/vectorization opportunity (adding a even/odd check to an accumulator)
Summary: [12/13/14/15 Regression] Missed optimization/vectorization opportunity (addin...
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 14.2.0
Importance: P2 enhancement
Target Milestone: 15.0
Assignee: Andrew Pinski
URL: https://gcc.gnu.org/pipermail/gcc-pat...
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
 
Reported: 2024-11-25 20:54 UTC by Ionuț Nicula
Modified: 2024-11-28 04:26 UTC (History)
0 users

See Also:
Host:
Target:
Build:
Known to work: 7.5.0, 8.5.0
Known to fail: 10.1.0
Last reconfirmed: 2024-11-25 00:00:00


Attachments

Description Ionuț Nicula 2024-11-25 20:54:23 UTC
Consider the following code:

    #include <cstddef>
    #include <cstdint>
    
    using u64 = uint64_t;
    using u8  = uint8_t;
    
    const auto is_even_bool = [](u8 n) -> bool {
        return n % 2 == 0;
    };
    
    const auto is_even_u64 = [](u8 n) -> u64 {
        return n % 2 == 0;
    };
    
    const auto is_even_convert = [](u8 n) -> u64 {
        return is_even_bool(n);
    };
    
    u64 count_even_v1(u8 *data, size_t len)
    {
        u64 ret = 0;
        for (size_t i = 0; i < len; i++)
            ret += is_even_bool(data[i]); // Not vectorized
        return ret;
    }
    
    u64 count_even_v2(u8 *data, size_t len)
    {
        u64 ret = 0;
        for (size_t i = 0; i < len; i++)
            ret += is_even_u64(data[i]); // Vectorized
        return ret;
    }
    
    u64 count_even_v3(u8 *data, size_t len)
    {
        u64 ret = 0;
        for (size_t i = 0; i < len; i++)
            ret += is_even_convert(data[i]); // Not vectorized
        return ret;
    }
    
    u64 count_even_v4(u8 *data, size_t len)
    {
        u64 ret = 0;
        for (size_t i = 0; i < len; i++)
            ret += static_cast<u64>(is_even_bool(data[i])); // Not vectorized
        return ret;
    }

The following assembly is generated by g++ 14.2, with flags -O3 -march=skylake-avx512 (godbolt: https://godbolt.org/z/c7W9G6WbW):

    count_even_v1(unsigned char*, unsigned long):
            test    rsi, rsi
            je      .L4
            add     rsi, rdi
            xor     edx, edx
    .L3:
            movzx   eax, BYTE PTR [rdi]
            inc     rdi
            not     eax
            and     eax, 1
            add     rdx, rax
            cmp     rsi, rdi
            jne     .L3
            mov     rax, rdx
            ret
    .L4:
            xor     edx, edx
            mov     rax, rdx
            ret
    count_even_v2(unsigned char*, unsigned long):
            mov     rcx, rsi
            test    rsi, rsi
            je      .L15
            lea     rax, [rsi-1]
            cmp     rax, 30
            jbe     .L16
            mov     rdx, rsi
            mov     r8d, 16843009
            vpxor   xmm5, xmm5, xmm5
            mov     rax, rdi
            and     rdx, -32
            vpbroadcastd    ymm6, r8d
            lea     rsi, [rdx+rdi]
    .L10:
            vmovdqa ymm0, ymm6
            add     rax, 32
            vpternlogd      ymm0, ymm0, YMMWORD PTR [rax-32], $0x44
            vpmovzxbw       ymm1, xmm0
            vextracti32x4   xmm0, ymm0, 0x1
            vpmovzxwd       ymm3, xmm1
            vpmovzxbw       ymm0, xmm0
            vextracti32x4   xmm1, ymm1, 0x1
            vpmovzxdq       ymm2, xmm3
            vextracti32x4   xmm3, ymm3, 0x1
            vpmovzxwd       ymm4, xmm0
            vpmovzxdq       ymm3, xmm3
            vextracti32x4   xmm0, ymm0, 0x1
            vpmovzxwd       ymm1, xmm1
            vpmovzxwd       ymm0, xmm0
            vpaddq  ymm2, ymm2, ymm3
            vpmovzxdq       ymm3, xmm4
            vpaddq  ymm2, ymm2, ymm3
            vpmovzxdq       ymm3, xmm0
            vextracti32x4   xmm4, ymm4, 0x1
            vpaddq  ymm2, ymm2, ymm3
            vpmovzxdq       ymm3, xmm1
            vextracti32x4   xmm1, ymm1, 0x1
            vpmovzxdq       ymm1, xmm1
            vpmovzxdq       ymm4, xmm4
            vextracti32x4   xmm0, ymm0, 0x1
            vpaddq  ymm1, ymm3, ymm1
            vpmovzxdq       ymm0, xmm0
            vpaddq  ymm1, ymm1, ymm4
            vpaddq  ymm0, ymm1, ymm0
            vpaddq  ymm0, ymm2, ymm0
            vpaddq  ymm5, ymm5, ymm0
            cmp     rax, rsi
            jne     .L10
            vextracti64x2   xmm0, ymm5, 0x1
            vpaddq  xmm0, xmm0, xmm5
            vpsrldq xmm1, xmm0, 8
            vmovdqa xmm3, xmm0
            vpaddq  xmm1, xmm0, xmm1
            vmovq   rax, xmm1
            cmp     rcx, rdx
            je      .L22
            vzeroupper
    .L9:
            mov     rsi, rcx
            sub     rsi, rdx
            lea     r8, [rsi-1]
            cmp     r8, 14
            jbe     .L13
            mov     eax, 16843009
            mov     r8, rsi
            vpbroadcastd    xmm0, eax
            and     r8, -16
            vpternlogd      xmm0, xmm0, XMMWORD PTR [rdi+rdx], $0x44
            add     rdx, r8
            and     esi, 15
            vpmovzxbw       xmm2, xmm0
            vpsrldq xmm0, xmm0, 8
            vpmovzxwd       xmm4, xmm2
            vpsrldq xmm2, xmm2, 8
            vpmovzxbw       xmm0, xmm0
            vpmovzxdq       xmm1, xmm4
            vpsrldq xmm4, xmm4, 8
            vpmovzxwd       xmm5, xmm0
            vpmovzxdq       xmm4, xmm4
            vpsrldq xmm0, xmm0, 8
            vpmovzxwd       xmm2, xmm2
            vpaddq  xmm1, xmm1, xmm4
            vpsrldq xmm4, xmm5, 8
            vpmovzxwd       xmm0, xmm0
            vpmovzxdq       xmm4, xmm4
            vpmovzxdq       xmm5, xmm5
            vpaddq  xmm1, xmm1, xmm4
            vpmovzxdq       xmm4, xmm0
            vpaddq  xmm1, xmm1, xmm4
            vpsrldq xmm0, xmm0, 8
            vpaddq  xmm1, xmm1, xmm3
            vpmovzxdq       xmm3, xmm2
            vpmovzxdq       xmm0, xmm0
            vpsrldq xmm2, xmm2, 8
            vpmovzxdq       xmm2, xmm2
            vpaddq  xmm2, xmm3, xmm2
            vpaddq  xmm2, xmm2, xmm5
            vpaddq  xmm0, xmm2, xmm0
            vpaddq  xmm0, xmm1, xmm0
            vpsrldq xmm1, xmm0, 8
            vpaddq  xmm0, xmm0, xmm1
            vmovq   rax, xmm0
            je      .L7
    .L13:
            movzx   esi, BYTE PTR [rdi+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+1]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+1+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+2]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+2+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+3]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+3+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+4]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+4+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+5]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+5+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+6]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+6+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+7]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+7+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+8]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+8+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+9]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+9+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+10]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+10+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+11]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+11+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+12]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+12+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+13]
            cmp     rsi, rcx
            jnb     .L7
            movzx   esi, BYTE PTR [rdi+13+rdx]
            not     esi
            and     esi, 1
            add     rax, rsi
            lea     rsi, [rdx+14]
            cmp     rsi, rcx
            jnb     .L7
            movzx   edx, BYTE PTR [rdi+14+rdx]
            not     edx
            and     edx, 1
            add     rax, rdx
            ret
    .L15:
            xor     eax, eax
    .L7:
            ret
    .L16:
            vpxor   xmm3, xmm3, xmm3
            xor     edx, edx
            xor     eax, eax
            jmp     .L9
    .L22:
            vzeroupper
            ret
    count_even_v3(unsigned char*, unsigned long):
            test    rsi, rsi
            je      .L26
            add     rsi, rdi
            xor     edx, edx
    .L25:
            movzx   eax, BYTE PTR [rdi]
            inc     rdi
            not     eax
            and     eax, 1
            add     rdx, rax
            cmp     rsi, rdi
            jne     .L25
            mov     rax, rdx
            ret
    .L26:
            xor     edx, edx
            mov     rax, rdx
            ret
    count_even_v4(unsigned char*, unsigned long):
            test    rsi, rsi
            je      .L31
            add     rsi, rdi
            xor     edx, edx
    .L30:
            movzx   eax, BYTE PTR [rdi]
            inc     rdi
            not     eax
            and     eax, 1
            add     rdx, rax
            cmp     rsi, rdi
            jne     .L30
            mov     rax, rdx
            ret
    .L31:
            xor     edx, edx
            mov     rax, rdx
            ret

So why is only count_even_v2() successfully vectorized? For example, shouldn't the compiler have an easy time seeing that:

    ret += is_even_bool(data[i]);

is essentially the same thing as:

    ret += is_even_u64(data[i]);

because is_even_bool(data[i]) will be converted to a u64 before the add-assignment? Clang generates identical code for all four versions and successfully vectorizes them.

Also, an interesting observation: g++ does vectorize the same code when the type of the accumulator is the same as the type of the data (i.e. we have "u32 *data" and "u32 ret", for example). In that case, all four versions are vectorized: https://godbolt.org/z/6WenP9Y3T
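
For reference, here is a minimal sketch of that matching-type variant (the function name is made up for illustration; the godbolt link above has the full set of four):

    #include <cstddef>
    #include <cstdint>

    using u32 = uint32_t;

    // The accumulator has the same type as the elements; per the
    // godbolt link above, g++ vectorizes all four equivalent
    // formulations of this loop at -O3.
    u32 count_even_same_type(u32 *data, size_t len)
    {
        u32 ret = 0;
        for (size_t i = 0; i < len; i++)
            ret += (data[i] % 2 == 0); // Vectorized
        return ret;
    }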

Note: the same things happen with g++ trunk.
Comment 1 Andrew Pinski 2024-11-25 21:05:18 UTC
Confirmed.
Let me see if I can handle this.

The problem is we have this IR for count_even_v1, count_even_v3, and count_even_v4:

  unsigned char _12;
...
  _11 = (bool) _12;
  _3 = (long unsigned int) _11;


This could be done as:
  unsigned char _12;

  _t = _12 & 1;
  _3 = (long unsigned int) _t;

Which is the IR for count_even_v2.
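
To put the two shapes side by side, here is a rough C++ sketch (the names mirror the SSA temporaries above; it assumes the incoming value is already 0 or 1, as it is here, so that the bool conversion and the mask agree):
```
#include <cstdint>

// Shape of count_even_v1/v3/v4: a cast to bool followed by a widening cast.
uint64_t via_bool_cast(unsigned char v)
{
    bool b = v;               // _11 = (bool) _12;
    return (uint64_t) b;      // _3 = (long unsigned int) _11;
}

// Shape of count_even_v2: mask the low bit, then one widening cast.
uint64_t via_bit_and(unsigned char v)
{
    unsigned char t = v & 1;  // _t = _12 & 1;
    return (uint64_t) t;      // _3 = (long unsigned int) _t;
}
```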
Comment 2 Andrew Pinski 2024-11-25 21:07:07 UTC
>(i.e. we have "u32 *data" and "u32 ret", for example)
That is because we do handle `(same_type)((bool)a)` -> `a & 1` already.
Comment 3 Andrew Pinski 2024-11-25 21:52:25 UTC
Here is a reduced testcase for the missed optimization, which also deals with the canonical form here.
```
static inline bool even (unsigned char t)
{
  return (t%2) == 0;
}
unsigned char f(unsigned char t)
{
  unsigned char y = even(t);
  unsigned int l = even(t);
  unsigned int l1 = y;
  return l1 == l;
}
```

`& 1` is the canonical form: even though the cast might look better, logical operators are handled better than casts (especially casts to bool).
Comment 4 Andrew Pinski 2024-11-25 21:56:48 UTC
This is actually a regression in that 8.5.0 used to be able to vectorize this.
It can also be worked around by using the option -fno-tree-vrp.
Comment 5 Andrew Pinski 2024-11-25 22:00:12 UTC
Note this is specifically related to the even/odd check rather than anything else.
Comment 6 Ionuț Nicula 2024-11-25 22:07:50 UTC
Good point; my previous title was too generic, while the actual problem seems to be very specific (i.e. related to the even/odd check, as you said).
Comment 7 Ionuț Nicula 2024-11-25 22:15:37 UTC
I see that if I replace the condition with `n%3==0`, or `n>3`, the code is vectorized. So GCC is clearly capable of vectorizing this `accumulator += boolean` pattern.
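
For example, a minimal sketch of one such variant (the function name, and the reuse of the u8/u64 aliases from the description, are for illustration only):

    #include <cstddef>
    #include <cstdint>

    using u64 = uint64_t;
    using u8  = uint8_t;

    // Same accumulation pattern as count_even_v1, but with a
    // divisible-by-3 check instead of the even/odd check; unlike the
    // %2 version, g++ vectorizes this at -O3.
    u64 count_div3(u8 *data, size_t len)
    {
        u64 ret = 0;
        for (size_t i = 0; i < len; i++)
            ret += (data[i] % 3 == 0); // Vectorized
        return ret;
    }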
Comment 8 Andrew Pinski 2024-11-26 00:05:16 UTC
I have a patch but I need to write up a few testcases.

Basically, I modified the case for `(convert (convert ...))` where the inner and outer conversions are nop conversions, to also allow the outer conversion to be wider than the inner one, and then we canonicalize as I mentioned in comment #3.
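
In source terms, the extended fold looks roughly like this (a sketch with illustrative fixed-width types, not the actual match.pd pattern; in the bug the intermediate type is the 1-bit bool, which the existing `(same_type)((bool)a)` -> `a & 1` fold from comment #2 already treats as a truncation):
```
#include <cstdint>

// Nested casts: the outer precision (64) is wider than the inner
// precision (32), and the intermediate precision (8) is smaller and
// unsigned -- the case the patch newly allows.
uint64_t nested_casts(unsigned int x)
{
    return (uint64_t)(unsigned char)x;
}

// Equivalent canonical form: mask to the intermediate width, then a
// single widening conversion.
uint64_t canonical(unsigned int x)
{
    return (uint64_t)(x & 0xff);
}
```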
Comment 9 Andrew Pinski 2024-11-27 00:19:08 UTC
Patch posted:
https://gcc.gnu.org/pipermail/gcc-patches/2024-November/670122.html
Comment 10 GCC Commits 2024-11-27 18:22:08 UTC
The trunk branch has been updated by Andrew Pinski <pinskia@gcc.gnu.org>:

https://gcc.gnu.org/g:bca515ff1893fe4ca1a9042364af3c43f93a397c

commit r15-5730-gbca515ff1893fe4ca1a9042364af3c43f93a397c
Author: Andrew Pinski <quic_apinski@quicinc.com>
Date:   Mon Nov 25 16:04:21 2024 -0800

    match: Improve handling of double convert [PR117776]
    
    For a double conversion, we simplify it into a conversion with an
    and when the outer and inner precisions match and the intermediate
    precision is smaller and unsigned. We should be able to extend this
    to the case where the outer precision is larger too. This is a good
    canonicalization too.
    
    Bootstrapped and tested on x86_64-linux-gnu.
    
            PR tree-optimization/117776
    gcc/ChangeLog:
    
            * match.pd (nested int casts): Allow for the case
            where the final prec is greater than the original
            prec.
    
    gcc/testsuite/ChangeLog:
    
            * g++.dg/vect/pr117776.cc: New test.
            * gcc.dg/tree-ssa/cast-3.c: New test.
    
    Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Comment 11 Andrew Pinski 2024-11-27 18:25:12 UTC
Fixed on the trunk. This patch is not worth backporting even though this is a regression; it is a missed optimization, and one that happens only under one specific condition (even/odd checks), and it took 5 years for someone to report it.