[Bug tree-optimization/102974] GCC optimization is very poor for add carry and multiplication combos

pinskia at gcc dot gnu.org gcc-bugzilla@gcc.gnu.org
Wed Oct 27 21:18:16 GMT 2021


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include<cstdint>
#include<bit>

/// Two limbs of at least 32 bits each; `low` is stored before `high`.
struct ul32x2
{
    std::uint_least32_t low;
    std::uint_least32_t high;
};

/// Multiplies two 32-bit values and yields the full 64-bit product as two halves.
///
/// @param a     first factor
/// @param b     second factor
/// @param high  receives the upper 32 bits of `a*b`
/// @return      the lower 32 bits of `a*b`
inline constexpr std::uint_least32_t umul_least_32(std::uint_least32_t a,std::uint_least32_t b,std::uint_least32_t& high) noexcept
{
// std::endian and std::bit_cast are C++20 library features; like the other
// C++20 facilities used in this file they are only touched when their
// feature-test macros are defined. Otherwise the shift-based path is used.
#if defined(__cpp_lib_endian) && defined(__cpp_lib_bit_cast)
        if constexpr(std::endian::native==std::endian::little||std::endian::native==std::endian::big)
        {
                // Member order mirrors the in-memory order of the two halves
                // of the 64-bit product for the respective byte order.
                struct ul32x2_little_endian_t
                {
                        std::uint_least32_t low,high;
                };
                struct ul32x2_big_endian_t
                {
                        std::uint_least32_t high,low;
                };
                using ul32x2_t = std::conditional_t<std::endian::native==std::endian::little,ul32x2_little_endian_t,ul32x2_big_endian_t>;
                auto ret{std::bit_cast<ul32x2_t>(static_cast<std::uint_least64_t>(a)*b)};
                high=ret.high;
                return ret.low;
        }
        else
#endif
        {
                // Byte-order independent fallback: split the product by value.
                std::uint_least64_t v{static_cast<std::uint_least64_t>(a)*b};
                high=static_cast<std::uint_least32_t>(v>>32u);
                return static_cast<std::uint_least32_t>(v);
        }
}
/// Portable add-with-carry: computes `a + b + carry` in the unsigned type T.
///
/// @param carry  incoming carry bit
/// @param a      first addend
/// @param b      second addend
/// @param out    receives the sum truncated to T
/// @return       true when the exact sum does not fit in T (carry out)
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry_naive(bool carry,T a,T b,T& out) noexcept
{
        // For T narrower than int, `+` promotes its operands to int, so the
        // result is converted back to T explicitly. Brace-initialising a T
        // from that int would be a narrowing conversion and is ill-formed.
        T const temp{static_cast<T>(a+static_cast<T>(carry))};
        out=static_cast<T>(temp+b);
        // At most one of the two additions can wrap: if adding the carry
        // wrapped, temp is 0 and temp+b cannot wrap.
        return (out < b) | (temp < a);
}

/// Add-with-carry that dispatches to a compiler intrinsic when one is
/// available and otherwise falls back to add_carry_naive.
///
/// Selection order, as written below:
///   1. constant evaluation            -> add_carry_naive
///   2. MSVC (not clang-cl) on x86/x64 -> _addcarryx_u64/_addcarryx_u32/_addcarry_u16/_addcarry_u8
///   3. __builtin_addc{b,s,,l,ll}      -> chosen by sizeof(T)
///   4. __builtin_ia32_addcarry*       -> 64- and 32-bit operands only
///   5. anything else                  -> add_carry_naive
///
/// @param carry  incoming carry bit
/// @param a      first addend
/// @param b      second addend
/// @param out    receives the sum truncated to T
/// @return       the carry-out flag
///
/// NOTE(review): the MSVC intrinsics are presumably declared by <intrin.h>,
/// which is not included in the visible part of the file - confirm.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry(bool carry,T a,T b,T& out) noexcept
{
#if __cpp_lib_is_constant_evaluated >= 201811L
        if(std::is_constant_evaluated())
                return add_carry_naive(carry,a,b,out);
        else
#endif
        {
#if defined(_MSC_VER) && !defined(__clang__)
#if (defined(_M_IX86) || defined(_M_AMD64))
        if constexpr(sizeof(T)==8)
        {
#if defined(_M_AMD64)
                return _addcarryx_u64(carry,a,b,reinterpret_cast<std::uint64_t*>(__builtin_addressof(out)));
#else
                // 32-bit x86: chain two 32-bit adds. The first four bytes of
                // each operand are treated as the low word.
                return _addcarryx_u32(_addcarryx_u32(carry,
                        *reinterpret_cast<std::uint32_t*>(__builtin_addressof(a)),*reinterpret_cast<std::uint32_t*>(__builtin_addressof(b)),reinterpret_cast<std::uint32_t*>(__builtin_addressof(out))),
                        reinterpret_cast<std::uint32_t*>(__builtin_addressof(a))[1],reinterpret_cast<std::uint32_t*>(__builtin_addressof(b))[1],reinterpret_cast<std::uint32_t*>(__builtin_addressof(out))+1);
#endif
        }
        else if constexpr(sizeof(T)==4)
                return _addcarryx_u32(carry,a,b,reinterpret_cast<std::uint32_t*>(__builtin_addressof(out)));
        else if constexpr(sizeof(T)==2)
                return _addcarry_u16(carry,a,b,reinterpret_cast<short unsigned*>(__builtin_addressof(out)));
        else if constexpr(sizeof(T)==1)
                return _addcarry_u8(carry,a,b,reinterpret_cast<char unsigned*>(__builtin_addressof(out)));
        else
                return add_carry_naive(carry,a,b,out);
#else
                return add_carry_naive(carry,a,b,out);
#endif
// __has_builtin must not appear in the same #if expression as
// defined(__has_builtin): a preprocessor that lacks it would still have to
// parse the call-like use. The probes are therefore nested inside this group.
#elif defined(__has_builtin)
#if (__has_builtin(__builtin_addcb)&&__has_builtin(__builtin_addcs)&&__has_builtin(__builtin_addc)&&__has_builtin(__builtin_addcl)&&__has_builtin(__builtin_addcll))
        if constexpr(sizeof(T)==sizeof(long long unsigned))
        {
                long long unsigned carryout;
                out=__builtin_addcll(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(long unsigned))
        {
                long unsigned carryout;
                out=__builtin_addcl(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(unsigned))
        {
                unsigned carryout;
                out=__builtin_addc(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(short unsigned))
        {
                short unsigned carryout;
                out=__builtin_addcs(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(char unsigned))
        {
                char unsigned carryout;
                out=__builtin_addcb(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else
        {
                return add_carry_naive(carry,a,b,out);
        }
#elif (__has_builtin(__builtin_ia32_addcarryx_u32)||__has_builtin(__builtin_ia32_addcarry_u32)||__has_builtin(__builtin_ia32_addcarryx_u64))
        if constexpr(sizeof(T)==8)
        {
#if __has_builtin(__builtin_ia32_addcarryx_u64)
                using may_alias_ptr_type [[gnu::may_alias]] = unsigned long long*;
                return __builtin_ia32_addcarryx_u64(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#else
                // No 64-bit builtin: copy out the two 4-byte halves of each
                // operand (bytes 0..3 are used as the low word) and chain two
                // 32-bit adds, writing the halves of `out` in the same order.
                std::uint32_t a_low;
                std::uint32_t a_high;
                __builtin_memcpy(__builtin_addressof(a_low),__builtin_addressof(a),4);
                __builtin_memcpy(__builtin_addressof(a_high),reinterpret_cast<char const*>(__builtin_addressof(a))+4,4);
                std::uint32_t b_low;
                std::uint32_t b_high;
                __builtin_memcpy(__builtin_addressof(b_low),__builtin_addressof(b),4);
                __builtin_memcpy(__builtin_addressof(b_high),reinterpret_cast<char const*>(__builtin_addressof(b))+4,4);
                using may_alias_ptr_type [[gnu::may_alias]] = unsigned*;
        #if __has_builtin(__builtin_ia32_addcarry_u32)
                return __builtin_ia32_addcarry_u32(__builtin_ia32_addcarry_u32(carry,a_low,b_low,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))),
                        a_high,b_high,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))+1);
        #elif __has_builtin(__builtin_ia32_addcarryx_u32)
                return __builtin_ia32_addcarryx_u32(__builtin_ia32_addcarryx_u32(carry,a_low,b_low,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))),
                        a_high,b_high,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))+1);
        #else
                return add_carry_naive(carry,a,b,out);
        #endif
#endif
        }
        else if constexpr(sizeof(T)==4)
        {
                using may_alias_ptr_type [[gnu::may_alias]] = unsigned*;
#if __has_builtin(__builtin_ia32_addcarry_u32)
                return __builtin_ia32_addcarry_u32(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#elif __has_builtin(__builtin_ia32_addcarryx_u32)
                return __builtin_ia32_addcarryx_u32(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#else
                return add_carry_naive(carry,a,b,out);
#endif
        }
        else
                return add_carry_naive(carry,a,b,out);  //16 bit addcarry simply does not exist on gcc and clang
#else
        return add_carry_naive(carry,a,b,out);
#endif
#else
        return add_carry_naive(carry,a,b,out);
#endif
        }
}
/// Multiplies two 64-bit values and yields the full 128-bit product, built
/// from four 32x32->64 partial products (umul_least_32) that are summed
/// limb by limb with add_carry.
///
/// @param a     first factor
/// @param b     second factor
/// @param high  receives the upper 64 bits of `a*b`
/// @return      the lower 64 bits of `a*b`
std::uint_least64_t umul_least_64(std::uint_least64_t a,std::uint_least64_t b,std::uint_least64_t& high) noexcept
{
    // Split the operands by value (shift/mask) instead of by object
    // representation, so that limb 0 is the low half on every byte order.
    constexpr std::uint_least64_t low_mask{0xFFFFFFFFu};
    std::uint_least32_t const a0{static_cast<std::uint_least32_t>(a&low_mask)};
    std::uint_least32_t const a1{static_cast<std::uint_least32_t>((a>>32u)&low_mask)};
    std::uint_least32_t const b0{static_cast<std::uint_least32_t>(b&low_mask)};
    std::uint_least32_t const b1{static_cast<std::uint_least32_t>((b>>32u)&low_mask)};

    // Partial products. With W = 2^32:
    //   a0*b0 = c0    + c1*W       (weight 1)
    //   a0*b1 = a0b1l + a0b1h*W    (weight W)
    //   a1*b0 = a1b0l + a1b0h*W    (weight W)
    //   a1*b1 = c2    + c3*W       (weight W^2)
    std::uint_least32_t c1;
    std::uint_least32_t c0{umul_least_32(a0,b0,c1)};
    std::uint_least32_t a0b1h;
    std::uint_least32_t a0b1l{umul_least_32(a0,b1,a0b1h)};
    std::uint_least32_t a1b0h;
    std::uint_least32_t a1b0l{umul_least_32(a1,b0,a1b0h)};
    std::uint_least32_t c3;
    std::uint_least32_t c2{umul_least_32(a1,b1,c3)};

    // Fold a0*b1 into limbs 1 and 2; remember the carry out of limb 2.
    bool carry{add_carry(false,c1,a0b1l,c1)};
    carry=add_carry(carry,a0b1h,c2,c2);
    std::uint_least32_t temp{carry};
    // Fold a1*b0 into limbs 1 and 2.
    carry=add_carry(false,c1,a1b0l,c1);
    carry=add_carry(carry,a1b0h,c2,c2);
    // Both carries out of limb 2 land in limb 3. The result of this call is
    // ignored, exactly as in the original sequence.
    static_cast<void>(add_carry(carry,temp,c3,c3));

    high=(static_cast<std::uint_least64_t>(c3)<<32u)|c2;
    return (static_cast<std::uint_least64_t>(c1)<<32u)|c0;
}


More information about the Gcc-bugs mailing list