[Bug rtl-optimization/97437] New: builtins subcarry and addcarry still not generate the right code. Not get optimized to immediate value
euloanty at live dot com
gcc-bugzilla@gcc.gnu.org
Thu Oct 15 08:28:08 GMT 2020
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97437
Bug ID: 97437
Summary: builtins subcarry and addcarry still not generate the
right code. Not get optimized to immediate value
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: euloanty at live dot com
Target Milestone: ---
#include<cstdint>
#include<array>
#if defined(_MSC_VER)
#include<intrin.h>
#elif defined(__x86_64__) || defined(__i386__)
#include<immintrin.h>
#endif
// Fixed-size 32-byte value stored as an array of machine-word limbs.
struct field_number
{
    // Limb type: 64-bit when std::size_t is at least 8 bytes wide, 32-bit otherwise.
    using value_type = std::conditional_t<(sizeof(std::size_t) >= 8), std::uint64_t, std::uint32_t>;

    // 32 bytes of storage in total: 4 limbs of 64 bits, or 8 limbs of 32 bits.
    value_type content[32 / sizeof(value_type)];

    // Unchecked read-only access to the limb at index `pos`.
    constexpr value_type const& operator[](std::size_t pos) const noexcept
    {
        return content[pos];
    }

    // Unchecked mutable access to the limb at index `pos`.
    constexpr value_type& operator[](std::size_t pos) noexcept
    {
        return content[pos];
    }
};
namespace intrinsics
{
namespace details
{
// Portable subtract-with-borrow.
// Stores a - b - borrow (wrapped to the width of T) in `out` and returns
// true when the subtraction borrowed, i.e. when a < b + borrow.
template<typename T>
inline constexpr bool sub_borrow_portable(bool borrow,T a,T b,T& out) noexcept
{
    out=static_cast<T>(a-b-borrow);
    // With an incoming borrow the amount subtracted is at least 1, so a result
    // that wraps back to exactly `a` still means a borrow happened (b was all
    // ones). Without an incoming borrow, out==a simply means b==0, so only a
    // strictly larger result signals a borrow.
    return borrow?out>=a:out>a;
}
}

// Subtract-with-borrow on unsigned integers.
//   borrow : incoming borrow from the previous (less significant) limb
//   a, b   : minuend and subtrahend
//   out    : receives a - b - borrow, wrapped to the width of T
// Returns the outgoing borrow.
//
// On x86 the hardware intrinsics are used at run time; during constant
// evaluation, on other targets, and for operand sizes without an available
// intrinsic, the portable implementation is used instead.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool sub_borrow(bool borrow,T a,T b,T& out) noexcept
{
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
    // Intrinsics are not usable in constant expressions.
    if(std::is_constant_evaluated())
        return details::sub_borrow_portable(borrow,a,b,out);
    else
#endif
    {
#if defined(__x86_64__)
        if constexpr(sizeof(T)==8)
            return _subborrow_u64(borrow,a,b,
#if !defined(__INTEL_COMPILER ) &&(defined(__GNUC__) || defined(__clang__))
                // GCC/clang declare the output as unsigned long long*, while
                // std::uint64_t may be unsigned long.
                reinterpret_cast<unsigned long long*>(&out));
#else
                &out);
#endif
        else
#endif
        if constexpr(sizeof(T)==4)
            return _subborrow_u32(borrow,a,b,reinterpret_cast<std::uint32_t*>(&out));
#if defined(_MSC_VER)
        // The 16- and 8-bit variants are MSVC-only; GCC and clang do not
        // provide them, so those compilers take the portable path below.
        else if constexpr(sizeof(T)==2)
            return _subborrow_u16(borrow,a,b,reinterpret_cast<std::uint16_t*>(&out));
        else if constexpr(sizeof(T)==1)
            return _subborrow_u8(borrow,a,b,reinterpret_cast<std::uint8_t*>(&out));
#endif
        else
            // Any operand size without an intrinsic (including 8-byte operands
            // when __x86_64__ is not defined).
            return details::sub_borrow_portable(borrow,a,b,out);
    }
#else
    return details::sub_borrow_portable(borrow,a,b,out);
#endif
}
}
// Subtracts y from x limb by limb with borrow propagation, then applies two
// correction passes that each subtract 38 from the result if the preceding
// pass borrowed out of the top limb.
// NOTE(review): subtracting 38 per wrap-around is consistent with arithmetic
// modulo 2^256 - 38; confirm the intended modulus against the callers.
// NOTE(review): only limbs 0..3 are processed, which covers the whole value
// only when field_number::value_type is 64 bits wide.
field_number operator-(field_number const& x,field_number const& y) noexcept
{
    using namespace intrinsics;
    using unsigned_type = field_number::value_type;
    constexpr unsigned_type zero{};
    field_number f;
    // f = x - y, with the borrow chained from limb 0 up to limb 3.
    bool borrow{sub_borrow(false,x[0],y[0],f[0])};
    borrow=sub_borrow(borrow,x[1],y[1],f[1]);
    borrow=sub_borrow(borrow,x[2],y[2],f[2]);
    borrow=sub_borrow(borrow,x[3],y[3],f[3]);
    // v = 0 - 0 - borrow: all ones if the chain above borrowed, zero otherwise.
    unsigned_type v{};
    sub_borrow(borrow,v,v,v);
    // Reduce the mask to the correction term: 38 if borrowed, 0 if not.
    v&=static_cast<unsigned_type>(38);
    // First correction pass: f -= v, propagating the borrow through all limbs.
    borrow=sub_borrow(false,f[0],v,f[0]);
    borrow=sub_borrow(borrow,f[1],zero,f[1]);
    borrow=sub_borrow(borrow,f[2],zero,f[2]);
    borrow=sub_borrow(borrow,f[3],zero,f[3]);
    // v - v - borrow again yields all ones or zero, whatever v currently holds.
    sub_borrow(borrow,v,v,v);
    v&=static_cast<unsigned_type>(38);
    // Second correction pass, in case the first one borrowed as well.
    borrow=sub_borrow(false,f[0],v,f[0]);
    borrow=sub_borrow(borrow,f[1],zero,f[1]);
    borrow=sub_borrow(borrow,f[2],zero,f[2]);
    // The top limb must be read from f[3]; reading f[2] here would overwrite
    // limb 3 with a copy of limb 2 minus the borrow.
    sub_borrow(borrow,f[3],zero,f[3]);
    return f;
}
https://godbolt.org/z/xM8xef
operator-(field_number const&, field_number const&):
movq (%rsi), %r9
subq (%rdx), %r9
movq %rdi, %r8
movq %rdx, %rax
movq %r9, (%rdi)
movq 8(%rsi), %rdi
sbbq 8(%rdx), %rdi
movq %rdi, 8(%r8)
movq 16(%rsi), %rdx
sbbq 16(%rax), %rdx
movq %rdx, 16(%r8)
movq 24(%rax), %rax
movq 24(%rsi), %rsi
sbbq %rax, %rsi
// Here is an output dependency. There is no need to movl $0 into %eax.
movl $0, %eax
movq %rax, %rcx
sbbq %rax, %rcx
andl $38, %ecx
subq %rcx, %r9
sbbq %rax, %rdi // Why `sbbq %rax, %rdi` instead of `sbbq $0, %rdi`?
// GCC should not need to allocate or use the %rax register here.
sbbq %rax, %rdx
sbbq %rax, %rsi
sbbq %rcx, %rcx
andl $38, %ecx
subq %rcx, %r9
sbbq %rax, %rdi
movq %r9, (%r8)
sbbq %rax, %rdx
movq %rdi, 8(%r8)
movq %rdx, 16(%r8)
sbbq %rax, %rdx
movq %r8, %rax
movq %rdx, 24(%r8)
ret
The assembly GCC generates is still worse than clang's, although clang does not
generate optimal code either.
GCC does not fold the zero operand of the subborrow intrinsic into an immediate
value; it keeps the zero in a register instead.
The "correct" assembly should look like what clang generates (using different
registers is no problem), minus clang's `xorl %ecx, %ecx` clean-up instruction.
operator-(field_number const&, field_number const&): # @operator-(field_number const&, field_number const&)
movq %rdi, %rax
movq (%rsi), %r8
subq (%rdx), %r8
movq 8(%rsi), %r9
sbbq 8(%rdx), %r9
movq 16(%rsi), %rdi
sbbq 16(%rdx), %rdi
movq 24(%rsi), %rsi
sbbq 24(%rdx), %rsi
sbbq %rcx, %rcx
andl $38, %ecx
subq %rcx, %r8
sbbq $0, %r9
sbbq $0, %rdi
sbbq $0, %rsi
sbbq %rcx, %rcx
andl $38, %ecx
subq %rcx, %r8
sbbq $0, %r9
movq %r8, (%rax)
movq %r9, 8(%rax)
sbbq $0, %rdi
movq %rdi, 16(%rax)
sbbq $0, %rdi
movq %rdi, 24(%rax)
retq
More information about the Gcc-bugs
mailing list