[Bug tree-optimization/102974] GCC optimization is very poor for add carry and multiplication combos
pinskia at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Wed Oct 27 21:18:16 GMT 2021
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include<cstdint>
#include<bit>
/// Two unsigned limbs of at least 32 bits each; `low` is declared before `high`.
struct ul32x2
{
    std::uint_least32_t low;
    std::uint_least32_t high;
};
/// Multiplies two `std::uint_least32_t` values in 64-bit arithmetic and returns
/// the product split into two halves.
///
/// @param a     first factor
/// @param b     second factor
/// @param high  receives the widened product shifted right by 32 bits
/// @return      the widened product converted to `std::uint_least32_t`
///
/// The halves are taken by value (shift and conversion) rather than by
/// reinterpreting the object representation, so the result does not depend on
/// the target's byte order, on struct layout, or on any header beyond <cstdint>.
inline constexpr std::uint_least32_t umul_least_32(std::uint_least32_t a,std::uint_least32_t b,std::uint_least32_t& high) noexcept
{
    // Widen one operand first so the multiplication itself is done in 64 bits.
    std::uint_least64_t const product{static_cast<std::uint_least64_t>(a)*b};
    high=static_cast<std::uint_least32_t>(product>>32u);
    return static_cast<std::uint_least32_t>(product);
}
/// Portable add-with-carry: stores `a + b + carry` (wrapped to T) in `out` and
/// returns whether the addition overflowed T.
///
/// @param carry  incoming carry
/// @param a      first addend
/// @param b      second addend
/// @param out    receives the wrapped sum
/// @return       true when either partial addition wrapped around
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry_naive(bool carry,T a,T b,T& out) noexcept
{
    // For T narrower than int both sums are computed in int; the explicit casts
    // wrap them back to T. Brace-initialising a T directly from that int would
    // be a narrowing conversion and therefore ill-formed.
    T const with_carry{static_cast<T>(carry+a)};
    out=static_cast<T>(with_carry+b);
    // An unsigned addition wrapped exactly when the result is smaller than an operand.
    return (out<b)|(with_carry<a);
}
/// Add-with-carry that prefers a compiler intrinsic/builtin and falls back to
/// `add_carry_naive`.
///
/// @param carry  incoming carry
/// @param a      first addend
/// @param b      second addend
/// @param out    receives the sum
/// @return       the carry-out of the addition
///
/// Selection, in order:
///  1. During constant evaluation (when `std::is_constant_evaluated` is
///     available) the naive version is used.
///  2. MSVC (not clang-cl) on x86/x64: the `_addcarry*` intrinsics.
///  3. Compilers that report all of `__builtin_addc{b,s,,l,ll}` via `__has_builtin`.
///  4. Compilers that report any of the `__builtin_ia32_addcarry*` builtins.
///  5. Otherwise the naive version.
///
/// `__has_builtin(...)` is only written inside an `#if` nested under
/// `defined(__has_builtin)`, so a preprocessor that lacks `__has_builtin`
/// never has to parse it.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry(bool carry,T a,T b,T& out) noexcept
{
#if __cpp_lib_is_constant_evaluated >= 201811L
    if(std::is_constant_evaluated())
        return add_carry_naive(carry,a,b,out);
    else
#endif
    {
#if defined(_MSC_VER) && !defined(__clang__)
#if (defined(_M_IX86) || defined(_M_AMD64))
        if constexpr(sizeof(T)==8)
        {
#if defined(_M_AMD64)
            return _addcarryx_u64(carry,a,b,reinterpret_cast<std::uint64_t*>(__builtin_addressof(out)));
#else
            // No 64-bit intrinsic here: chain two 32-bit additions, the first on
            // the words at index 0 and the second on the words at index 1.
            // NOTE(review): this treats index 0 as the low word — confirm for the target.
            return _addcarryx_u32(_addcarryx_u32(carry,
                *reinterpret_cast<std::uint32_t*>(__builtin_addressof(a)),*reinterpret_cast<std::uint32_t*>(__builtin_addressof(b)),reinterpret_cast<std::uint32_t*>(__builtin_addressof(out))),
                reinterpret_cast<std::uint32_t*>(__builtin_addressof(a))[1],reinterpret_cast<std::uint32_t*>(__builtin_addressof(b))[1],reinterpret_cast<std::uint32_t*>(__builtin_addressof(out))+1);
#endif
        }
        else if constexpr(sizeof(T)==4)
            return _addcarryx_u32(carry,a,b,reinterpret_cast<std::uint32_t*>(__builtin_addressof(out)));
        else if constexpr(sizeof(T)==2)
            return _addcarry_u16(carry,a,b,reinterpret_cast<short unsigned*>(__builtin_addressof(out)));
        else if constexpr(sizeof(T)==1)
            return _addcarry_u8(carry,a,b,reinterpret_cast<char unsigned*>(__builtin_addressof(out)));
        else
            return add_carry_naive(carry,a,b,out);
#else
        return add_carry_naive(carry,a,b,out);
#endif
#elif defined(__has_builtin)
#if (__has_builtin(__builtin_addcb)&&__has_builtin(__builtin_addcs)&&__has_builtin(__builtin_addc)&&__has_builtin(__builtin_addcl)&&__has_builtin(__builtin_addcll))
        // Pick the builtin whose operand type has the same size as T, widest first.
        if constexpr(sizeof(T)==sizeof(long long unsigned))
        {
            long long unsigned carryout;
            out=__builtin_addcll(a,b,carry,__builtin_addressof(carryout));
            return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(long unsigned))
        {
            long unsigned carryout;
            out=__builtin_addcl(a,b,carry,__builtin_addressof(carryout));
            return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(unsigned))
        {
            unsigned carryout;
            out=__builtin_addc(a,b,carry,__builtin_addressof(carryout));
            return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(short unsigned))
        {
            short unsigned carryout;
            out=__builtin_addcs(a,b,carry,__builtin_addressof(carryout));
            return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(char unsigned))
        {
            char unsigned carryout;
            out=__builtin_addcb(a,b,carry,__builtin_addressof(carryout));
            return carryout;
        }
        else
        {
            return add_carry_naive(carry,a,b,out);
        }
#elif (__has_builtin(__builtin_ia32_addcarryx_u32)||__has_builtin(__builtin_ia32_addcarry_u32)||__has_builtin(__builtin_ia32_addcarryx_u64))
        if constexpr(sizeof(T)==8)
        {
#if __has_builtin(__builtin_ia32_addcarryx_u64)
            using may_alias_ptr_type [[gnu::may_alias]] = unsigned long long*;
            return __builtin_ia32_addcarryx_u64(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#else
            // No 64-bit builtin: copy out the two 4-byte words of each operand
            // and chain two 32-bit additions, writing to out[0] then out[1].
            // NOTE(review): this treats the first 4 bytes as the low word — confirm for the target.
            std::uint32_t a_low;
            std::uint32_t a_high;
            __builtin_memcpy(__builtin_addressof(a_low),__builtin_addressof(a),4);
            __builtin_memcpy(__builtin_addressof(a_high),reinterpret_cast<char const*>(__builtin_addressof(a))+4,4);
            std::uint32_t b_low;
            std::uint32_t b_high;
            __builtin_memcpy(__builtin_addressof(b_low),__builtin_addressof(b),4);
            __builtin_memcpy(__builtin_addressof(b_high),reinterpret_cast<char const*>(__builtin_addressof(b))+4,4);
            using may_alias_ptr_type [[gnu::may_alias]] = unsigned*;
#if __has_builtin(__builtin_ia32_addcarry_u32)
            return __builtin_ia32_addcarry_u32(__builtin_ia32_addcarry_u32(carry,a_low,b_low,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))),
                a_high,b_high,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))+1);
#elif __has_builtin(__builtin_ia32_addcarryx_u32)
            return __builtin_ia32_addcarryx_u32(__builtin_ia32_addcarryx_u32(carry,a_low,b_low,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))),
                a_high,b_high,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))+1);
#else
            return add_carry_naive(carry,a,b,out);
#endif
#endif
        }
        else if constexpr(sizeof(T)==4)
        {
            using may_alias_ptr_type [[gnu::may_alias]] = unsigned*;
#if __has_builtin(__builtin_ia32_addcarry_u32)
            return __builtin_ia32_addcarry_u32(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#elif __has_builtin(__builtin_ia32_addcarryx_u32)
            return __builtin_ia32_addcarryx_u32(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#else
            return add_carry_naive(carry,a,b,out);
#endif
        }
        else
            return add_carry_naive(carry,a,b,out); //16 bit addcarry simply does not exist on gcc and clang
#else
        return add_carry_naive(carry,a,b,out);
#endif
#else
        return add_carry_naive(carry,a,b,out);
#endif
    }
}
/// Full 64x64 -> 128-bit unsigned multiplication built from four 32x32 -> 64
/// partial products.
///
/// @param a     first factor
/// @param b     second factor
/// @param high  receives the upper 64 bits of the product
/// @return      the lower 64 bits of the product
///
/// With a = a0 + a1*2^32 and b = b0 + b1*2^32 the product is
///   a0*b0 + (a0*b1 + a1*b0)*2^32 + a1*b1*2^64,
/// accumulated below into four 32-bit limbs c0..c3 (c0 least significant).
/// Operands are split and limbs are joined with shifts, i.e. by value, so the
/// result does not depend on the target's byte order.
std::uint_least64_t umul_least_64(std::uint_least64_t a,std::uint_least64_t b,std::uint_least64_t& high) noexcept
{
    auto const a0{static_cast<std::uint_least32_t>(a)};
    auto const a1{static_cast<std::uint_least32_t>(a>>32u)};
    auto const b0{static_cast<std::uint_least32_t>(b)};
    auto const b1{static_cast<std::uint_least32_t>(b>>32u)};

    // a0*b0 seeds limbs c0/c1 and a1*b1 seeds limbs c2/c3.
    std::uint_least32_t c1;
    std::uint_least32_t const c0{umul_least_32(a0,b0,c1)};
    std::uint_least32_t a0b1h;
    std::uint_least32_t a0b1l{umul_least_32(a0,b1,a0b1h)};
    std::uint_least32_t a1b0h;
    std::uint_least32_t a1b0l{umul_least_32(a1,b0,a1b0h)};
    std::uint_least32_t c3;
    std::uint_least32_t c2{umul_least_32(a1,b1,c3)};

    // Fold a0*b1 into c1/c2 and remember the carry that falls out of c2.
    bool carry{add_carry(false,c1,a0b1l,c1)};
    carry=add_carry(carry,a0b1h,c2,c2);
    std::uint_least32_t pending{carry};

    // Fold a1*b0 into c1/c2, then push both outstanding carries into c3.
    carry=add_carry(false,c1,a1b0l,c1);
    carry=add_carry(carry,a1b0h,c2,c2);
    add_carry(carry,pending,c3,c3);

    high=(static_cast<std::uint_least64_t>(c3)<<32u)|c2;
    return (static_cast<std::uint_least64_t>(c1)<<32u)|c0;
}
More information about the Gcc-bugs
mailing list