Bug 96738 - GCC generates worse assembly than clang and fails to vectorize code that clang vectorizes
Summary: GCC generates worse assembly than clang and fails to vectorize code that clang vectorizes
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 11.0
Importance: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
 
Reported: 2020-08-21 18:34 UTC by fdlbxtqi
Modified: 2020-08-25 11:36 UTC
CC: 2 users

See Also:
Host:
Target: x86_64-linux-gnu
Build:
Known to work:
Known to fail:
Last reconfirmed: 2020-08-25 00:00:00


Description fdlbxtqi 2020-08-21 18:34:27 UTC
https://godbolt.org/z/9K3369

#include<array>
#include<cstdint>

struct number
{
	std::array<std::uint64_t,5> num;


	inline constexpr std::uint64_t& operator[](std::size_t position) noexcept
	{
		return num[position];
	}
	inline constexpr std::uint64_t const& operator[](std::size_t position) const noexcept
	{
		return num[position];
	}
};


number add_reduce(number const& a,number const& b) noexcept
{
    constexpr auto reduce_mask_51{(static_cast<std::uint64_t>(1) << 51) - 1};
    number out;
	std::uint64_t c{(a[0] + b[0])>>51};
	out[1] = a[1] + b[1] + c; c = (out[1] >> 51); out[1] &= reduce_mask_51;
	out[2] = a[2] + b[2] + c; c = (out[2] >> 51); out[2] &= reduce_mask_51;
	out[3] = a[3] + b[3] + c; c = (out[3] >> 51); out[3] &= reduce_mask_51;
	out[4] = a[4] + b[4] + c; c = (out[4] >> 51); out[4] &= reduce_mask_51;
	out[0] = c * 19;
	return out;
}
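
For reference, reduce_mask_51 is 2^51 - 1, i.e. the constant 2251799813685247 (0x7FFFFFFFFFFFF) that both compilers materialize with movabsq (or a broadcast load) below.  A minimal check, not part of the reproducer:

#include <cstdint>

// Sanity check of the 51-bit reduction mask used above:
// (1 << 51) - 1 == 2251799813685247 == 0x7FFFFFFFFFFFF.
constexpr std::uint64_t reduce_mask_51_check{(static_cast<std::uint64_t>(1) << 51) - 1};
static_assert(reduce_mask_51_check == 2251799813685247ULL);
static_assert(reduce_mask_51_check == 0x7FFFFFFFFFFFFULL);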


gcc:

add_reduce(number const&, number const&):
        movq    (%rdx), %rax
        addq    (%rsi), %rax
        movq    %rdi, %r8
        movq    %rdx, %rdi
        shrq    $51, %rax
        movq    8(%rdx), %rdx
        addq    8(%rsi), %rdx
        movq    %rsi, %rcx
        movabsq $2251799813685247, %rsi
        addq    %rdx, %rax
        movq    %rax, %rdx
        shrq    $51, %rax
        andq    %rsi, %rdx
        movq    %rdx, 8(%r8)
        movq    16(%rdi), %rdx
        addq    16(%rcx), %rdx
        addq    %rdx, %rax
        movq    %rax, %rdx
        shrq    $51, %rax
        andq    %rsi, %rdx
        movq    %rdx, 16(%r8)
        movq    24(%rdi), %rdx
        addq    24(%rcx), %rdx
        addq    %rax, %rdx
        movq    %rdx, %rax
        shrq    $51, %rdx
        andq    %rsi, %rax
        movq    %rax, 24(%r8)
        movq    32(%rdi), %rax
        addq    32(%rcx), %rax
        addq    %rdx, %rax
        andq    %rax, %rsi
        shrq    $51, %rax
        leaq    (%rax,%rax,8), %rdx
        movq    %rsi, 32(%r8)
        leaq    (%rax,%rdx,2), %rax
        movq    %rax, (%r8)
        movq    %r8, %rax
        ret

clang:
add_reduce(number const&, number const&):             # @add_reduce(number const&, number const&)
        movq    %rdi, %rax
        movq    (%rdx), %rcx
        movq    8(%rdx), %rdi
        addq    (%rsi), %rcx
        shrq    $51, %rcx
        addq    8(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        shrq    $51, %rcx
        movabsq $2251799813685247, %r8          # imm = 0x7FFFFFFFFFFFF
        andq    %r8, %rdi
        movq    %rdi, 8(%rax)
        movq    16(%rdx), %rdi
        addq    16(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        shrq    $51, %rcx
        andq    %r8, %rdi
        movq    %rdi, 16(%rax)
        movq    24(%rdx), %rdi
        addq    24(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        andq    %r8, %rdi
        movq    %rdi, 24(%rax)
        movq    32(%rdx), %rdx
        addq    32(%rsi), %rdx
        shrq    $51, %rcx
        addq    %rcx, %rdx
        movq    %rdx, %rcx
        shrq    $51, %rcx
        andq    %r8, %rdx
        movq    %rdx, 32(%rax)
        leaq    (%rcx,%rcx,8), %rdx
        leaq    (%rcx,%rdx,2), %rcx
        movq    %rcx, (%rax)
        retq

clang with -march=native:

.LCPI0_0:
        .quad   2251799813685247
add_reduce(number const&, number const&):             # @add_reduce(number const&, number const&)
        movq    %rdi, %rax
        movq    (%rdx), %rcx
        movq    8(%rdx), %rdi
        addq    (%rsi), %rcx
        shrq    $51, %rcx
        addq    8(%rsi), %rdi
        addq    %rcx, %rdi
        vmovq   %rdi, %xmm0
        shrq    $51, %rdi
        movq    16(%rdx), %rcx
        addq    16(%rsi), %rcx
        addq    %rdi, %rcx
        vmovq   %rcx, %xmm1
        shrq    $51, %rcx
        movq    24(%rdx), %rdi
        addq    24(%rsi), %rdi
        addq    %rcx, %rdi
        vmovq   %rdi, %xmm2
        shrq    $51, %rdi
        movq    32(%rdx), %rcx
        addq    32(%rsi), %rcx
        addq    %rdi, %rcx
        vpunpcklqdq     %xmm1, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm1[0]
        vmovq   %rcx, %xmm1
        vpunpcklqdq     %xmm1, %xmm2, %xmm1     # xmm1 = xmm2[0],xmm1[0]
        vinserti128     $1, %xmm1, %ymm0, %ymm0
        vpandq  .LCPI0_0(%rip){1to4}, %ymm0, %ymm0
        shrq    $51, %rcx
        vmovdqu %ymm0, 8(%rax)
        leaq    (%rcx,%rcx,8), %rdx
        leaq    (%rcx,%rdx,2), %rcx
        movq    %rcx, (%rax)
        vzeroupper
        retq
Comment 1 Richard Biener 2020-08-25 11:36:12 UTC
GCC fails to SLP vectorize this because SLP discovery fails for this reduction
scheme.  The only thing clang vectorizes seems to be the mask reduction
and the store.

We're not falling back to scalar operand construction for the mask and
the store, for some reason.
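
For illustration, the vectorizable part clang finds is the common "& reduce_mask_51" on out[1]..out[4] plus their four contiguous stores; the carry chain itself stays scalar.  A minimal hand-written sketch of that pattern, assuming AVX2 intrinsics and a hypothetical helper name (this is not output of either compiler):

#include <immintrin.h>
#include <cstdint>

// Hypothetical helper showing the SLP group clang forms: mask four
// already-computed 64-bit lanes with 2^51 - 1 and store them contiguously
// at &out[1], i.e. one vpand plus one 32-byte store.  Requires AVX2.
inline void mask_and_store_4(std::uint64_t* out_plus_1,
                             std::uint64_t v1, std::uint64_t v2,
                             std::uint64_t v3, std::uint64_t v4) noexcept
{
    constexpr std::uint64_t reduce_mask_51{(static_cast<std::uint64_t>(1) << 51) - 1};
    const __m256i lanes = _mm256_set_epi64x(static_cast<long long>(v4),
                                            static_cast<long long>(v3),
                                            static_cast<long long>(v2),
                                            static_cast<long long>(v1));
    const __m256i mask = _mm256_set1_epi64x(static_cast<long long>(reduce_mask_51));
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_plus_1),
                        _mm256_and_si256(lanes, mask));
}

If SLP discovery built the vector operand from the four scalar carry-chain results instead of giving up on the reduction scheme, GCC could emit the same vpand + store pattern.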